Created
January 22, 2012 15:29
-
-
Save tecoholic/1657380 to your computer and use it in GitHub Desktop.
Second scraper for wp-featured-pics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
''' This script collects the picture page url and the description for the
picture. The urls are taken from the pic_page_urls.list
'''
import os
import urllib2

from BeautifulSoup import BeautifulSoup as bs
def main():
    ''' Fetch each picture page listed in pic_page_urls.list and extract the
    image file url and the description text from it.

    Creates the "xmls" folder (relative to the current working directory)
    when it does not exist yet. Blank lines in the url list are ignored, and
    pages that do not have the expected layout (a "mw-content-ltr" div with
    at least two table cells, the first one holding a link) are skipped.

    Returns nothing. Errors raised while opening a url propagate to the
    caller.
    '''
    # Pseudocode
    #
    # create a folder to store the xmls
    # open and read pic_page_urls.list
    # for each url
    #     open the url
    #     create a soup
    #     extract the file page url and desc text
    #     write the data in a xml file
    #     Make sure the text is readable in tamil
    if not os.path.isdir("xmls"):
        # The os module has no "makedir"; mkdir creates the single folder.
        os.mkdir("xmls")

    opener = urllib2.build_opener()
    # Send a browser-like User-agent instead of the urllib2 default
    # (presumably the target site rejects the default one; verify).
    opener.addheaders = [("User-agent", "Mozilla/5.0")]

    # "with" guarantees the list file is closed even if a fetch fails.
    with open("pic_page_urls.list", "r") as urlfile:
        for line in urlfile:
            # Each line still carries its newline; strip it before fetching.
            url = line.strip()
            if not url:
                continue

            descpage = opener.open(url)
            try:
                soup = bs(descpage.read())
            finally:
                # Release the connection once the page body has been read.
                descpage.close()

            content = soup.find("div", {"class": "mw-content-ltr"})
            if content is None:
                # Page without the expected content div: nothing to extract.
                continue

            tds = content.findAll("td")
            if len(tds) < 2 or tds[0].a is None:
                # Expected layout: first cell links the image, second cell
                # holds the description.
                continue

            imageurl = tds[0].a["href"]
            imagedesc = tds[1].text.strip()
            # TODO: write imageurl and imagedesc to an xml file inside
            # "xmls" (see pseudocode above); this step is not implemented.
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment