@2xyo
Created October 3, 2012 21:38
Downloads GIFs from Señor GIF (memebase.cheezburger.com/senorgif).
# imageDownloader.py
# Finds and downloads all GIF images from successive numbered pages of a site.
# Adapted from FB's recursive image downloader - 201009094
import urllib2
import urlparse
from os.path import basename
from BeautifulSoup import BeautifulSoup  # for HTML parsing

page = 1
urlList = []

# recursively download images, walking the numbered pages under the root URL
def downloadImages(url, level, minFileSize):  # the root URL is level 0
    global page
    # do not go to other websites
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    if url in urlList:  # prevent visiting the same URL twice
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        # a page that fails to load ends the crawl
        return
    soup = BeautifulSoup(''.join(urlContent))
    # find and download all GIF images on the page
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        # download only the proper image files
        if imgUrl.lower().endswith('.gif'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                if len(imgData) >= minFileSize:
                    print " " + imgUrl + " size: " + str(len(imgData) / 1024) + " KB"
                    fileName = basename(urlparse.urlsplit(imgUrl).path)
                    output = open(fileName, 'wb')
                    output.write(imgData)
                    output.close()
            except:
                pass
    print
    # move on to the next numbered page; note that 'level' is never checked,
    # so the recursion only stops when a page fails to load (or Python's
    # recursion limit is hit)
    page += 1
    downloadImages(rootUrl + str(page), level - 1, minFileSize)

# main
rootUrl = 'http://memebase.cheezburger.com/senorgif/page/'
netloc = urlparse.urlsplit(rootUrl + str(page)).netloc.split('.')
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, 1, 2093056)  # only keep GIFs of at least ~2 MB
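
The script above is Python 2 (urllib2, BeautifulSoup 3). For comparison, here is a minimal sketch of the same crawl in Python 3, assuming the third-party requests and beautifulsoup4 packages are installed; the function name download_gifs and the iterative loop are my own choices, not part of the gist.

# python3 sketch, assuming: pip install requests beautifulsoup4
import os
from urllib.parse import urlsplit, urljoin

import requests
from bs4 import BeautifulSoup

def download_gifs(page_url, min_file_size):
    # fetch one page; on a network error, stop the crawl like the original does
    try:
        html = requests.get(page_url, timeout=30).text
    except requests.RequestException:
        return False
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        # resolve relative src attributes against the page URL
        img_url = urljoin(page_url, img.get('src', ''))
        if not img_url.lower().endswith('.gif'):
            continue
        try:
            data = requests.get(img_url, timeout=30).content
        except requests.RequestException:
            continue
        if len(data) >= min_file_size:
            file_name = os.path.basename(urlsplit(img_url).path)
            with open(file_name, 'wb') as f:
                f.write(data)
    return True

root_url = 'http://memebase.cheezburger.com/senorgif/page/'
page = 1
while download_gifs(root_url + str(page), 2093056):
    page += 1

Iterating with a while loop instead of recursing avoids Python's recursion limit on long crawls; like the original, the crawl stops at the first page that fails to load.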