Download .gif files from Señor GIF (memebase.cheezburger.com/senorgif), page by page.
# imageDownloader.py
# Finds and downloads all .gif images, walking the site page by page.
# FB - 201009094
import urllib2
import urlparse
from os.path import basename
from BeautifulSoup import BeautifulSoup  # for HTML parsing

page = 1
print page
urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level, minFileSize):  # the root URL is level 0
    global page
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    global urlList
    if url in urlList:  # prevent using the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return
    soup = BeautifulSoup(urlContent)
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        # resolve relative src attributes against the page URL
        imgUrl = urlparse.urljoin(url, imgTag['src'])
        # download only the proper image files
        if imgUrl.lower().endswith('.gif'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                if len(imgData) >= minFileSize:
                    print "  " + imgUrl + " size: " + str(len(imgData) / 1024)
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(fileName, 'wb')
                    output.write(imgData)
                    output.close()
            except:
                pass
    # move on to the next page; recursion stops when a page fails to load
    # or leaves the original website
    page += 1
    linkUrl = rootUrl
    downloadImages(linkUrl + str(page), level - 1, 2093056)

# main
rootUrl = 'http://memebase.cheezburger.com/senorgif/page/'
netloc = urlparse.urlsplit(rootUrl + str(page)).netloc.split('.')
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, 1, 2093056)
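The script above targets Python 2 (urllib2 and BeautifulSoup 3). Below is a minimal sketch of the same page-by-page gif download for Python 3, assuming the beautifulsoup4 package is installed; the page range is illustrative, and the URL and size threshold mirror the values used in the gist.

# python3_gif_downloader.py -- hypothetical Python 3 sketch of the same approach
import os.path
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # assumes: pip install beautifulsoup4

ROOT_URL = 'http://memebase.cheezburger.com/senorgif/page/'
MIN_FILE_SIZE = 2093056  # skip gifs smaller than roughly 2 MB, as in the original


def download_gifs(first_page, max_pages):
    """Fetch pages sequentially and save every sufficiently large .gif found."""
    for page_number in range(first_page, first_page + max_pages):
        page_url = ROOT_URL + str(page_number)
        try:
            html = urllib.request.urlopen(page_url).read()
        except Exception:
            return  # stop at the first page that fails to load
        soup = BeautifulSoup(html, 'html.parser')
        for img in soup.find_all('img'):
            # resolve relative src attributes against the page URL
            img_url = urllib.parse.urljoin(page_url, img.get('src', ''))
            if not img_url.lower().endswith('.gif'):
                continue
            try:
                data = urllib.request.urlopen(img_url).read()
            except Exception:
                continue
            if len(data) >= MIN_FILE_SIZE:
                file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
                with open(file_name, 'wb') as output:
                    output.write(data)


download_gifs(1, 2)

Using a bounded loop over page numbers instead of recursion avoids hitting the recursion limit on long crawls; otherwise the flow is the same as the gist.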