jayrambhia · March 6, 2012 06:54 · jayrambhia · Mar 6, 2012
diff --git a/down_xkcd.py b/down_xkcd.py
 import urllib2
 import os
 from BeautifulSoup import BeautifulSoup

 # Site root; relative comic links found in pages are appended to this.
 BASE_URL = "http://xkcd.com"

 # Placeholder proxy settings: replace user, pass, proxy and port with
 # real values before running.
 proxy = dict(http="http://user:pass@proxy:port/",
              https="https://user:pass@proxy:port/")
 Proxy = urllib2.ProxyHandler(proxy)
 # Keep the opener as a module-level name (save_img uses it directly) and
 # also install it globally so plain urllib2.urlopen calls use the proxy.
 opener = urllib2.build_opener(Proxy)
 urllib2.install_opener(opener)
        
 def get_soup(URL):
     """Fetch URL and return its HTML parsed into a BeautifulSoup tree.

     The page is fetched with urllib2.urlopen, so the globally installed
     opener applies. Errors raised while fetching propagate to the caller.
     """
     page = urllib2.urlopen(URL)
     try:
         return BeautifulSoup(page.read())
     finally:
         # Release the connection even if reading or parsing fails.
         page.close()
    
 def get_next_URL(soup):
     """Return the absolute URL of the next comic, or None if there is none.

     Looks up the anchor with accesskey "n". An href of "#" means there is
     no next page; a page without such an anchor is treated the same way
     instead of raising TypeError on the missing element.
     """
     link = soup.find("a", {"href": True, "accesskey": "n"})
     if link is None or link["href"] == "#":
         return None
     return BASE_URL + link["href"]
    
 def get_previous_URL(soup):
     """Return the absolute URL of the previous comic, or None if there is none.

     Looks up the anchor with accesskey "p". An href of "#" means there is
     no previous page; a page without such an anchor is treated the same
     way instead of raising TypeError on the missing element.
     """
     link = soup.find("a", {"href": True, "accesskey": "p"})
     if link is None or link["href"] == "#":
         return None
     return BASE_URL + link["href"]

 def get_image_URL(soup):
     """Return the src of the <img> tag that has src, alt and title set."""
     # Require all three attributes to be present on the tag.
     required = dict.fromkeys(("src", "alt", "title"), True)
     comic_img = soup.find("img", required)
     return comic_img["src"]
    
 def save_img(img_URL, URL):
     """Download img_URL into the down_xkcd directory.

     The file is named "<comic>-<image name>", where <comic> is the last
     path component of URL (the comic page) and <image name> is the last
     path component of img_URL. The down_xkcd directory must already
     exist. Prints the file name once the image is written.
     """
     if not URL.endswith("/"):
         # Normalise so the comic number is always the second-to-last piece.
         URL = URL + "/"
     filename = "-".join([URL.split("/")[-2], img_URL.split("/")[-1]])
     page = opener.open(img_URL)
     try:
         # "with" closes the file even if the download fails part-way.
         with open(os.path.join("down_xkcd", filename), "wb") as f:
             f.write(page.read())
     finally:
         page.close()
     print("%s saved" % filename)

 def crawl(URL):
     """Download the comic at URL, then every later comic in turn.

     Follows the link returned by get_next_URL until it reports no further
     page. Does nothing if URL is None. Always returns None.
     """
     # Loop rather than recurse: one stack frame per comic would exceed
     # Python's recursion limit on a long run.
     while URL is not None:
         soup = get_soup(URL)
         save_img(get_image_URL(soup), URL)
         URL = get_next_URL(soup)
            
 def main():
     """Resume downloading from the newest saved comic, or from comic 1.

     Creates the down_xkcd directory if needed. Saved files are named
     "<number>-<image name>"; crawling restarts at the highest number
     found, so that comic is fetched again. Prints the starting URL.
     """
     if not os.path.isdir("down_xkcd"):
         os.mkdir("down_xkcd")
     numbers = []
     for filename in os.listdir("down_xkcd"):
         prefix = filename.split("-")[0]
         # Skip stray files whose name does not start with a comic number
         # instead of crashing on int().
         if prefix.isdigit():
             numbers.append(int(prefix))
     start = max(numbers) if numbers else 1
     URL = "/".join([BASE_URL, str(start)])
     print(URL)
     crawl(URL)

 # Start the download only when this file is run as a script.
 if __name__ == "__main__":
    main()
	import urllib2
	import os
	from BeautifulSoup import BeautifulSoup

	# Site root; relative comic links found in pages are appended to this.
	BASE_URL = "http://xkcd.com"

	# Placeholder proxy settings: replace user, pass, proxy and port with
	# real values before running.
	proxy = dict(http="http://user:pass@proxy:port/",
		https="https://user:pass@proxy:port/")
	Proxy = urllib2.ProxyHandler(proxy)
	# Keep the opener as a module-level name (save_img uses it directly) and
	# also install it globally so plain urllib2.urlopen calls use the proxy.
	opener = urllib2.build_opener(Proxy)
	urllib2.install_opener(opener)

	def get_soup(URL):
		"""Fetch URL and return its HTML parsed into a BeautifulSoup tree.

		The page is fetched with urllib2.urlopen, so the globally installed
		opener applies. Errors raised while fetching propagate to the caller.
		"""
		page = urllib2.urlopen(URL)
		try:
			return BeautifulSoup(page.read())
		finally:
			# Release the connection even if reading or parsing fails.
			page.close()

	def get_next_URL(soup):
		"""Return the absolute URL of the next comic, or None if there is none.

		Looks up the anchor with accesskey "n". An href of "#" means there is
		no next page; a page without such an anchor is treated the same way
		instead of raising TypeError on the missing element.
		"""
		link = soup.find("a", {"href": True, "accesskey": "n"})
		if link is None or link["href"] == "#":
			return None
		return BASE_URL + link["href"]

	def get_previous_URL(soup):
		"""Return the absolute URL of the previous comic, or None if there is none.

		Looks up the anchor with accesskey "p". An href of "#" means there is
		no previous page; a page without such an anchor is treated the same
		way instead of raising TypeError on the missing element.
		"""
		link = soup.find("a", {"href": True, "accesskey": "p"})
		if link is None or link["href"] == "#":
			return None
		return BASE_URL + link["href"]

	def get_image_URL(soup):
		"""Return the src of the <img> tag that has src, alt and title set."""
		# Require all three attributes to be present on the tag.
		comic_img = soup.find("img", {"src": True, "alt": True, "title": True})
		return comic_img["src"]

	def save_img(img_URL, URL):
		"""Download img_URL into the down_xkcd directory.

		The file is named "<comic>-<image name>", where <comic> is the last
		path component of URL (the comic page) and <image name> is the last
		path component of img_URL. The down_xkcd directory must already
		exist. Prints the file name once the image is written.
		"""
		if not URL.endswith("/"):
			# Normalise so the comic number is always the second-to-last piece.
			URL = URL + "/"
		filename = "-".join([URL.split("/")[-2], img_URL.split("/")[-1]])
		page = opener.open(img_URL)
		try:
			# "with" closes the file even if the download fails part-way.
			with open(os.path.join("down_xkcd", filename), "wb") as f:
				f.write(page.read())
		finally:
			page.close()
		print("%s saved" % filename)

	def crawl(URL):
		"""Download the comic at URL, then every later comic in turn.

		Follows the link returned by get_next_URL until it reports no further
		page. Does nothing if URL is None. Always returns None.
		"""
		# Loop rather than recurse: one stack frame per comic would exceed
		# Python's recursion limit on a long run.
		while URL is not None:
			soup = get_soup(URL)
			save_img(get_image_URL(soup), URL)
			URL = get_next_URL(soup)

	def main():
		"""Resume downloading from the newest saved comic, or from comic 1.

		Creates the down_xkcd directory if needed. Saved files are named
		"<number>-<image name>"; crawling restarts at the highest number
		found, so that comic is fetched again. Prints the starting URL.
		"""
		if not os.path.isdir("down_xkcd"):
			os.mkdir("down_xkcd")
		numbers = []
		for filename in os.listdir("down_xkcd"):
			prefix = filename.split("-")[0]
			# Skip stray files whose name does not start with a comic number
			# instead of crashing on int().
			if prefix.isdigit():
				numbers.append(int(prefix))
		start = max(numbers) if numbers else 1
		URL = "/".join([BASE_URL, str(start)])
		print(URL)
		crawl(URL)

	# Start the download only when this file is run as a script.
	if __name__ == "__main__":
		main()
No results found