samedi 27 juin 2015

How to make a loop in Python?

(1) Basically, what I want to do is extract the links contained in elements with the ("h3", "post-title entry-title") attributes, across a few pages of one website.

import urllib2
from bs4 import BeautifulSoup

# openlinkfile from texfile
links_file = open("link.txt")
links = links_file.readlines()          

for url in links:

# main page (read link from links)
htmltext = urllib2.urlopen(url).read()
soup = BeautifulSoup(htmltext)

# extract link in Links
relative_tags_to_desired_links = soup.find_all("h3", "post-title entry-title") 

for tag in relative_tags_to_desired_links:
    desired_element = tag.next_element.next_element
    print desired_element.get("href")

(2) To move to the next page, I will make my code read the HTML elements that have the ("a", "blog-pager-older-link") attributes, and then repeat the same steps as in (1).

  # page 2 (read link from links)
relative_tags_to_desired_linksp2= soup.find_all("a","blog-pager-older-link") 
for tagp2 in relative_tags_to_desired_linksp2:
    desired_elementp2 = tagp2.get("href")

    # read link from elementp2
    htmltextp2 = urllib2.urlopen(desired_elementp2).read()
    soupp2 = BeautifulSoup(htmltextp2)

    # extract link in elementp2
    relative_tags_to_desired_linksp2 = soupp2.find_all("h3", "post-title entry-title") 
    for taggingp2 in relative_tags_to_desired_linksp2:
        desired_elementp22 = taggingp2.next_element.next_element
        print desired_elementp22.get("href")

Let's say I want to read 3 pages from one particular website — do I need to repeat the same code three times? How can I simplify this code? I have tried a few approaches but couldn't find a solution yet.

This is my code:

import urllib2
from bs4 import BeautifulSoup

# openlinkfile from texfile
links_file = open("link.txt")
links = links_file.readlines()          

for url in links:

# main page (read link from links)
htmltext = urllib2.urlopen(url).read()
soup = BeautifulSoup(htmltext)

# extract link in Links
relative_tags_to_desired_links = soup.find_all("h3", "post-title entry-title") 

for tag in relative_tags_to_desired_links:
    desired_element = tag.next_element.next_element
    print desired_element.get("href")


# page 2 (read link from links)
relative_tags_to_desired_linksp2= soup.find_all("a","blog-pager-older-link") 
for tagp2 in relative_tags_to_desired_linksp2:
    desired_elementp2 = tagp2.get("href")

    # read link from elementp2
    htmltextp2 = urllib2.urlopen(desired_elementp2).read()
    soupp2 = BeautifulSoup(htmltextp2)

    # extract link in elementp2
    relative_tags_to_desired_linksp2 = soupp2.find_all("h3", "post-title entry-title") 
    for taggingp2 in relative_tags_to_desired_linksp2:
        desired_elementp22 = taggingp2.next_element.next_element
        print desired_elementp22.get("href")

# page 3 (read link from desired_elementp2)
relative_tags_to_desired_linksp3 = soupp2.find_all("a","blog-pager-older-link") 
for tagp3 in relative_tags_to_desired_linksp3:
    desired_elementp3 = tagp3.get("href")

     # read link from desired_elementp3
    htmltextp3 = urllib2.urlopen(desired_elementp3).read()
    soupp3 = BeautifulSoup(htmltextp3)

    #extract link in desired_elementp3
    relative_tags_to_desired_linksp32 = soupp3.find_all("h3", "post-title entry-title") 
    for taggingp3 in relative_tags_to_desired_linksp32:
        desired_elementp32 = taggingp3.next_element.next_element
        print desired_elementp32.get("href")

Link in link.txt file:

http://ift.tt/1GTAa5L

Thank you.

Aucun commentaire:

Enregistrer un commentaire