Skip to content

Instantly share code, notes, and snippets.

@myanbin
Last active December 28, 2015 13:49
Show Gist options
  • Save myanbin/7510076 to your computer and use it in GitHub Desktop.
Save myanbin/7510076 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""This spider finds the images on http://jandan.net/ooxx and
downloads them to a local directory."""
import urllib
import urllib2
import re
# Number used to name downloaded images. getImagesByUrl declares this
# variable global and increments it as it processes images; edit the value
# here to change the number the file names start from.
index = 1000
def getImagesByUrl(url, localdir='/cygdrive/c/nginx-1.2.7/html/21/images/'):
    """Download every .jpg/.gif/.png image referenced by the page at *url*.

    The page is fetched and scanned for ``<img src="..." />`` tags. Each
    image whose URL ends in a supported extension is saved into *localdir*
    as ``<index><extension>``, where ``index`` is the module-level counter.
    The counter advances after every download attempt (successful or not),
    so numbering continues across calls. Images with any other extension
    are skipped without consuming a number.

    Args:
        url: Address of the HTML page to scan.
        localdir: Directory the files are written to. Defaults to the path
            this script originally hard-coded.

    Raises:
        Whatever ``urllib2.urlopen`` raises if the page itself cannot be
        fetched; failures of individual image downloads are reported as
        ``fault`` on stdout and do not stop the loop.
    """
    import os

    global index

    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        response.close()

    for img_url in re.findall(r'<img src="(.+?)" />', html):
        for ext in ('.jpg', '.gif', '.png'):
            if img_url.endswith(ext):
                break
        else:
            # Unsupported extension: skip without touching the counter.
            continue

        filename = str(index) + ext
        # Single-argument print(...) emits the same text whether print is a
        # statement or a function.
        print(filename + ' ' + img_url)
        try:
            urllib.urlretrieve(img_url, os.path.join(localdir, filename))
        except Exception:
            # Best effort: one broken image must not abort the crawl, but
            # unlike a bare ``except:`` this lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            print('fault')
        else:
            print('ok')
        index = index + 1
if __name__ == '__main__':
    # Walk comment pages 100 through 149 and download the images on each.
    for page in range(100, 150):
        page_url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        getImagesByUrl(page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment