tecoholic · January 22, 2012 15:29
diff --git a/desc.py b/desc.py
 # --*-- coding:utf-8 --*--

 ''' This script collects the picture page URL and the description for each
 picture. The URLs are taken from the pic_page_urls.list file.
 '''

 import os
 import urllib2

 from BeautifulSoup import BeautifulSoup as bs

 def main():
    ''' Open every URL listed in pic_page_urls.list and pull the image link
    and the description text out of the page.

    The "xmls" folder is created when it does not exist yet. Writing the
    extracted data to an XML file is not implemented yet (see the TODO at
    the end of the loop).
    '''
    # Pseudocode
    #
    # create a folder to store the xmls
    # open and read pic_page_urls.list
    # for  each url
    #   open the url
    #   create a soup
    #   extract the file page url and desc text
    #   write the data in a xml file
    #   Make sure the text is readable in tamil
    if not os.path.isdir("xmls"):
        # The os module has no makedir(); mkdir() creates the directory.
        os.mkdir("xmls")
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    # The with-block closes the list file even when a request fails.
    with open("pic_page_urls.list", "r") as urlfile:
        for line in urlfile:
            # Each line still carries its newline; strip it, skip blanks.
            url = line.strip()
            if not url:
                continue
            descpage = opener.open(url)
            try:
                soup = bs(descpage.read())
            finally:
                # Release the connection before moving on to the next URL.
                descpage.close()
            content = soup.find("div", {"class" : "mw-content-ltr"})
            if content is None:
                # No content div on this page, so there is nothing to read.
                continue
            tds = content.findAll("td")
            if len(tds) < 2:
                # One cell is needed for the link, one for the description.
                continue
            imageurl = tds[0].a["href"]
            imagedesc = tds[1].text.strip()
            # TODO: write imageurl and imagedesc to an XML file in "xmls".


 if __name__ == "__main__":
    main()
	# ---- coding:utf-8 ----

	''' This script collects the picture page URL and the description for each
	picture. The URLs are taken from the pic_page_urls.list file.
	'''

	import os
	import urllib2

	from BeautifulSoup import BeautifulSoup as bs

	def main():
		''' Open every URL listed in pic_page_urls.list and pull the image link
		and the description text out of the page.

		The "xmls" folder is created when it does not exist yet. Writing the
		extracted data to an XML file is not implemented yet (see the TODO at
		the end of the loop).
		'''
		# Pseudocode
		#
		# create a folder to store the xmls
		# open and read pic_page_urls.list
		# for each url
		#   open the url
		#   create a soup
		#   extract the file page url and desc text
		#   write the data in a xml file
		#   Make sure the text is readable in tamil
		if not os.path.isdir("xmls"):
			# The os module has no makedir(); mkdir() creates the directory.
			os.mkdir("xmls")
		opener = urllib2.build_opener()
		opener.addheaders = [("User-agent", "Mozilla/5.0")]
		# The with-block closes the list file even when a request fails.
		with open("pic_page_urls.list", "r") as urlfile:
			for line in urlfile:
				# Each line still carries its newline; strip it, skip blanks.
				url = line.strip()
				if not url:
					continue
				descpage = opener.open(url)
				try:
					soup = bs(descpage.read())
				finally:
					# Release the connection before moving on to the next URL.
					descpage.close()
				content = soup.find("div", {"class" : "mw-content-ltr"})
				if content is None:
					# No content div on this page, so there is nothing to read.
					continue
				tds = content.findAll("td")
				if len(tds) < 2:
					# One cell is needed for the link, one for the description.
					continue
				imageurl = tds[0].a["href"]
				imagedesc = tds[1].text.strip()
				# TODO: write imageurl and imagedesc to an XML file in "xmls".


	if __name__ == "__main__":
		main()