Created
April 4, 2011 20:30
-
-
Save EntityReborn/902361 to your computer and use it in GitHub Desktop.
Simple image downloader; currently used for adult content.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import os
import re
import urllib2
# Root of the gallery site; category names and page hrefs are appended to it.
baseurl = "http://hof.voyeurweb.com/gallery/"
# Captures the href of the link that directly follows the "Overview Page" link.
nexturlre = re.compile(r'Overview Page</a> \| <a href="([^"]*)">')
# Captures (src, alt) from an <img> tag written with border="0".
imgurlre = re.compile(r'<img src="([^"]*)" alt="([^"]*)" border="0" />')
def downloadNext(url, cat):
    """Fetch one gallery page, save its picture, and return the next page link.

    The page at ``url`` is downloaded and scanned with the module-level
    ``nexturlre`` and ``imgurlre`` patterns.  The first image matched is
    written to ``pix/<cat>/<filename>`` unless a file with that name already
    exists, in which case nothing is downloaded.

    Args:
        url: Address of the gallery page to open.
        cat: Category name; used as the sub-directory under ``pix``.

    Returns:
        The first href captured by ``nexturlre``, or ``False`` when the page
        has no such link or the captured href is empty.

    Errors raised by ``urllib2.urlopen`` or by file I/O are not caught here.
    """
    # print is called with one parenthesised argument throughout so the
    # output is the same as with the Python 2 print statement.
    print("==================\nOpening %s" % url)
    with contextlib.closing(urllib2.urlopen(url)) as page:
        html = page.read()

    print("Grabbing next url")
    next_links = nexturlre.findall(html)
    print(next_links)
    # False (never None or "") is the stop value returned to the caller.
    nexturl = False
    if next_links and next_links[0]:
        nexturl = next_links[0]

    print("Grabbing pic url")
    pictures = imgurlre.findall(html)
    if not pictures:
        # A page without a matching <img> tag used to raise IndexError and
        # abort the whole crawl; skip it and keep following the chain.
        print("No pic found, skipping")
        return nexturl
    picurl = pictures[0][0]  # each match is a (src, alt) tuple

    picfilename = picurl.split("/")[-1]
    catdir = "pix/%s" % cat
    target = "pix/%s/%s" % (cat, picfilename)

    # Check before opening the picture URL so that skipped files cost no
    # network request.
    if os.path.exists(target):
        print("Pic already exists, skipping")
        return nexturl

    print("downloading %s" % picurl)
    with contextlib.closing(urllib2.urlopen(picurl)) as pic:
        picdata = pic.read()

    if not os.path.exists(catdir):
        os.makedirs(catdir)

    print("writing %s (%d)" % (picfilename, len(picdata)))
    with open(target, "wb") as picfile:
        picfile.write(picdata)

    return nexturl
# Driver script: read the gallery index, then walk every category page by
# page, saving one picture per page through downloadNext().

# hrefs captured here (no '.', ':' or '"' in them) are used as category names.
indexre = re.compile('<a href="([^.:"]*)" class="[^"]*">[^<]*</a>')
# Captures the href of a link wrapped in <small>...</small> on a category
# page; the first match is used as the starting page.
firsturl = re.compile('<small><a href="([^"]*)">[^<]*</a></small>')

with contextlib.closing(urllib2.urlopen(baseurl)) as f:
    data = f.read()
cats = indexre.findall(data)
print("%d categories found." % len(cats))

for cat in cats:
    print("Downloading from %s" % cat)
    catindex = baseurl + cat
    with contextlib.closing(urllib2.urlopen(catindex)) as f:
        data = f.read()

    entries = firsturl.findall(data)
    if not entries:
        # Indexing [0] on an empty result used to raise IndexError and stop
        # the whole run; move on to the next category instead.
        print("No entries found in %s, skipping" % cat)
        continue

    url = entries[0]
    print("Starting with %s" % url)
    # downloadNext returns False when there is no further page.
    while url:
        url = downloadNext(baseurl + cat + url, cat)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment