Download .gif files from Señor GIF (memebase.cheezburger.com/senorgif), page by page.
# imageDownloader.py
# Finds and downloads all .gif images, walking the site page by page.
# FB - 201009094
import urllib2
import urlparse
from os.path import basename
from BeautifulSoup import BeautifulSoup  # for HTML parsing

page = 1
print page
urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level, minFileSize):  # the root URL is level 0
    global page
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    global urlList
    if url in urlList:  # prevent using the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return
    soup = BeautifulSoup(urlContent)
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        # resolve relative src attributes against the page URL
        imgUrl = urlparse.urljoin(url, imgTag['src'])
        # download only the proper image files
        if imgUrl.lower().endswith('.gif'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                if len(imgData) >= minFileSize:
                    print "  " + imgUrl + " size: " + str(len(imgData) / 1024)
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(fileName, 'wb')
                    output.write(imgData)
                    output.close()
            except:
                pass
    # move on to the next page; recursion stops when a page fails to load
    # or leaves the original website
    page += 1
    linkUrl = rootUrl
    downloadImages(linkUrl + str(page), level - 1, 2093056)

# main
rootUrl = 'http://memebase.cheezburger.com/senorgif/page/'
netloc = urlparse.urlsplit(rootUrl + str(page)).netloc.split('.')
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, 1, 2093056)
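The script above targets Python 2 (urllib2 and BeautifulSoup 3). Below is a minimal sketch of the same page-by-page gif download for Python 3, assuming the beautifulsoup4 package is installed; the page range is illustrative, and the URL and size threshold mirror the values used in the gist.

# python3_gif_downloader.py -- hypothetical Python 3 sketch of the same approach
import os.path
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # assumes: pip install beautifulsoup4

ROOT_URL = 'http://memebase.cheezburger.com/senorgif/page/'
MIN_FILE_SIZE = 2093056  # skip gifs smaller than roughly 2 MB, as in the original


def download_gifs(first_page, max_pages):
    """Fetch pages sequentially and save every sufficiently large .gif found."""
    for page_number in range(first_page, first_page + max_pages):
        page_url = ROOT_URL + str(page_number)
        try:
            html = urllib.request.urlopen(page_url).read()
        except Exception:
            return  # stop at the first page that fails to load
        soup = BeautifulSoup(html, 'html.parser')
        for img in soup.find_all('img'):
            # resolve relative src attributes against the page URL
            img_url = urllib.parse.urljoin(page_url, img.get('src', ''))
            if not img_url.lower().endswith('.gif'):
                continue
            try:
                data = urllib.request.urlopen(img_url).read()
            except Exception:
                continue
            if len(data) >= MIN_FILE_SIZE:
                file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
                with open(file_name, 'wb') as output:
                    output.write(data)


download_gifs(1, 2)

Using a bounded loop over page numbers instead of recursion avoids hitting the recursion limit on long crawls; otherwise the flow is the same as the gist.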