Skip to content

Instantly share code, notes, and snippets.

@StrikeW
Created April 19, 2014 07:41
Show Gist options
  • Save StrikeW/11077080 to your computer and use it in GitHub Desktop.
Save StrikeW/11077080 to your computer and use it in GitHub Desktop.
A crawler demo
#!/usr/bin/env python
# encoding: utf-8
# A crawler demo: crawls the images of "Nao can dui hua" from http://baozoumanhua.com
import urllib2
import urllib
import re
def get_all_page_urls(first_page=2, last_page=2):
    """Yield the listing-page URLs to crawl, one per page number.

    Args:
        first_page: Number of the first page to yield (inclusive).
        last_page: Number of the last page to yield (inclusive). When it is
            smaller than ``first_page`` nothing is yielded.

    Yields:
        The base listing URL with the page number appended.

    With the default arguments only page 2 is produced, which is what the
    previously hard-coded range did.
    """
    url = 'http://baozoumanhua.com/duihua/hot/page/'
    # range() (rather than xrange) iterates identically here and also exists
    # on Python 3.
    for i in range(first_page, last_page + 1):
        yield url + str(i)
def get_webpage(url):
    """Download *url* and return the response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``read()`` on the opened response returns (the full body).

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by reading the
        response propagates to the caller unchanged.
    """
    from contextlib import closing

    # urllib2 responses are not context managers in Python 2, so closing()
    # is used to guarantee the connection is released even if read() fails.
    with closing(urllib2.urlopen(url)) as response:
        return response.read()
def get_img_and_save(webpage):
    """Download every article image found in *webpage* and save it to disk.

    The page is scanned for ``<a href="/articles/N" ...><img alt=... src=...
    style=...></a>`` fragments. For each match the ``src`` URL is downloaded
    and written to ``<alt>.jpg`` in the current working directory.

    Args:
        webpage: HTML text of a listing page.

    Returns:
        None. When the page contains no matching image nothing is downloaded
        or written.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by writing the file
        propagates to the caller unchanged.
    """
    from contextlib import closing

    # NOTE(review): the ``.*`` groups are greedy; if the site ever emits
    # several images on one line they would be captured as a single match.
    img_re = r'<a href="/articles/\d+".*>\s*<img\s*alt="(.*)"\s*src="(.*)"\s*style=.*></a>'
    for alt_text, img_url in re.findall(img_re, webpage):
        # Close the HTTP response explicitly instead of leaking it.
        with closing(urllib2.urlopen(img_url)) as response:
            data = response.read()
        # The alt text comes from the remote page: strip path separators so
        # the file is always created inside the current directory.
        filename = alt_text.replace('/', '_').replace('\\', '_') + '.jpg'
        # Image data is binary; text mode would corrupt it on platforms that
        # translate newlines.
        with open(filename, 'wb') as f:
            f.write(data)
if __name__ == '__main__':
    # Walk the listing pages one by one: download each page, then pull out
    # and save the images it references.
    for page_url in get_all_page_urls():
        get_img_and_save(get_webpage(page_url))
    print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment