Skip to content

Instantly share code, notes, and snippets.

@tecoholic
Created January 22, 2012 15:29
Show Gist options
  • Save tecoholic/1657380 to your computer and use it in GitHub Desktop.
Save tecoholic/1657380 to your computer and use it in GitHub Desktop.
Second scraper for wp-featured-pics
# --*-- coding:utf-8 --*--
''' This script collects the picture page url and the description for each
picture. The urls are taken from the pic_page_urls.list
'''
import os
import urllib2

from BeautifulSoup import BeautifulSoup as bs
def main():
    ''' Fetch every picture page listed in pic_page_urls.list and extract the
    image URL and the description text from each one.

    Creates the "xmls" folder in the current working directory when it is
    missing, then reads one URL per line from "pic_page_urls.list". Blank
    lines are ignored. A page that lacks the expected markup is reported on
    stderr and skipped so that one bad page does not abort the whole run.

    Takes no arguments and returns None.
    '''
    # Pseudocode
    #
    # create a folder to store the xmls
    # open and read pic_page_urls.list
    # for each url
    #     open the url
    #     create a soup
    #     extract the file page url and desc text
    #     write the data in a xml file
    # Make sure the text is readable in tamil
    import sys

    if not os.path.isdir("xmls"):
        # The os module has no "makedir"; mkdir creates the single folder.
        os.mkdir("xmls")

    opener = urllib2.build_opener()
    # Every request made through this opener carries this User-agent header.
    opener.addheaders = [("User-agent", "Mozilla/5.0")]

    # "with" guarantees the list file is closed even if a request fails.
    with open("pic_page_urls.list", "r") as urlfile:
        for line in urlfile:
            # Lines keep their trailing newline; strip it before requesting.
            url = line.strip()
            if not url:
                continue

            descpage = opener.open(url)
            try:
                soup = bs(descpage.read())
            finally:
                descpage.close()

            content = soup.find("div", {"class": "mw-content-ltr"})
            tds = content.findAll("td") if content is not None else []
            # The first cell must hold the image link, the second the text.
            if len(tds) < 2 or tds[0].a is None:
                sys.stderr.write(
                    "Skipping %s: unexpected page layout\n" % url)
                continue

            imageurl = tds[0].a["href"]
            imagedesc = tds[1].text.strip()
            # TODO: write imageurl and imagedesc to an XML file inside
            # "xmls" (the last pseudocode steps are not implemented yet).
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment