(1) Basically, what I want to do is extract the links contained in elements with these ("h3", "post-title entry-title") attributes across a few pages of one website.
import urllib2
from bs4 import BeautifulSoup
# openlinkfile from texfile
links_file = open("link.txt")
links = links_file.readlines()
for url in links:
# main page (read link from links)
htmltext = urllib2.urlopen(url).read()
soup = BeautifulSoup(htmltext)
# extract link in Links
relative_tags_to_desired_links = soup.find_all("h3", "post-title entry-title")
for tag in relative_tags_to_desired_links:
desired_element = tag.next_element.next_element
print desired_element.get("href")
(2) To move to the next page, I will have my code read the HTML elements that carry the ("a", "blog-pager-older-link") attributes, and then repeat the same extraction as in (1).
# page 2 (read link from links)
relative_tags_to_desired_linksp2= soup.find_all("a","blog-pager-older-link")
for tagp2 in relative_tags_to_desired_linksp2:
desired_elementp2 = tagp2.get("href")
# read link from elementp2
htmltextp2 = urllib2.urlopen(desired_elementp2).read()
soupp2 = BeautifulSoup(htmltextp2)
# extract link in elementp2
relative_tags_to_desired_linksp2 = soupp2.find_all("h3", "post-title entry-title")
for taggingp2 in relative_tags_to_desired_linksp2:
desired_elementp22 = taggingp2.next_element.next_element
print desired_elementp22.get("href")
Let's say I want to read 3 pages from one particular website — do I need to repeat the same code 3 times? How can I simplify this code? I have tried a few approaches but haven't found a solution yet.
This is my code:
import urllib2
from bs4 import BeautifulSoup
# openlinkfile from texfile
links_file = open("link.txt")
links = links_file.readlines()
for url in links:
# main page (read link from links)
htmltext = urllib2.urlopen(url).read()
soup = BeautifulSoup(htmltext)
# extract link in Links
relative_tags_to_desired_links = soup.find_all("h3", "post-title entry-title")
for tag in relative_tags_to_desired_links:
desired_element = tag.next_element.next_element
print desired_element.get("href")
# page 2 (read link from links)
relative_tags_to_desired_linksp2= soup.find_all("a","blog-pager-older-link")
for tagp2 in relative_tags_to_desired_linksp2:
desired_elementp2 = tagp2.get("href")
# read link from elementp2
htmltextp2 = urllib2.urlopen(desired_elementp2).read()
soupp2 = BeautifulSoup(htmltextp2)
# extract link in elementp2
relative_tags_to_desired_linksp2 = soupp2.find_all("h3", "post-title entry-title")
for taggingp2 in relative_tags_to_desired_linksp2:
desired_elementp22 = taggingp2.next_element.next_element
print desired_elementp22.get("href")
# page 3 (read link from desired_elementp2)
relative_tags_to_desired_linksp3 = soupp2.find_all("a","blog-pager-older-link")
for tagp3 in relative_tags_to_desired_linksp3:
desired_elementp3 = tagp3.get("href")
# read link from desired_elementp3
htmltextp3 = urllib2.urlopen(desired_elementp3).read()
soupp3 = BeautifulSoup(htmltextp3)
#extract link in desired_elementp3
relative_tags_to_desired_linksp32 = soupp3.find_all("h3", "post-title entry-title")
for taggingp3 in relative_tags_to_desired_linksp32:
desired_elementp32 = taggingp3.next_element.next_element
print desired_elementp32.get("href")
Link in link.txt file:
http://ift.tt/1GTAa5L
Thank you.
Aucun commentaire:
Enregistrer un commentaire