Skip to content

Instantly share code, notes, and snippets.

@ashfinal
Created June 3, 2017 14:18
Show Gist options
  • Save ashfinal/c6b7d21f38a2979a15fe52145417d8f9 to your computer and use it in GitHub Desktop.
import re
import os
import urllib
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
# --- Crawl configuration -----------------------------------------------------
# initurl is the gallery's landing page.  The numbered pages are requested as
# <baseurl>/<n>.html, so baseurl is initurl with the trailing slash removed.
initurl = 'http://www.03122.com/xinggan/12374/'
baseurl = os.path.split(initurl)[0]

# Image URLs gathered by the crawl; consumed later by the download pool.
urlpool = []

# Downloaded files go to ./tmp, resolved against the current working directory.
save_path = os.path.abspath("./tmp/")
if not os.path.exists(save_path):
    # Must be nested under the check above; calling it unconditionally would
    # fail with "file exists" on every run after the first.
    os.mkdir(save_path)

# Browser-like User-Agent sent with the page requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
}
# --- Discover how many pages the gallery has ---------------------------------
# A timeout keeps a stalled connection from blocking the script indefinitely.
inithtml = requests.get(initurl, headers=headers, timeout=30)
initsoup = BeautifulSoup(inithtml.text, 'html.parser')

# The page count is the first run of digits inside <span class="page-ch">.
pagespan = initsoup.find('span', attrs={'class': 'page-ch'})
if pagespan is None:
    raise RuntimeError("no <span class='page-ch'> found at %s" % initurl)
totalspan = pagespan.getText()

pagecounts = re.findall(r'\d+', totalspan)
if not pagecounts:
    raise RuntimeError("no page count found in pager text %r" % totalspan)
totalstr = pagecounts[0]

# totalnum is the *exclusive* upper bound of the 1-based page loop, hence +1.
# The message reports the real number of pages, not the loop bound.
totalnum = int(totalstr) + 1
print(u"Total %s pages." % (totalnum - 1))
# --- Collect the image URL from every gallery page ---------------------------
# Pages are numbered 1 .. totalnum-1 (totalnum is an exclusive bound).
for i in range(1, totalnum):
    tmpstr = baseurl + '/%s.html' % i
    print(tmpstr)
    # A timeout keeps one stalled page from hanging the whole crawl.
    tmphl = requests.get(tmpstr, headers=headers, timeout=30)
    tmpsoup = BeautifulSoup(tmphl.text, 'html.parser')

    # The picture is the <img> inside the element with id="ppPage".
    holder = tmpsoup.find(id='ppPage')
    tmpurl = None
    if holder is not None and holder.img is not None:
        tmpurl = holder.img.get('src')
    if not tmpurl:
        # Report and move on rather than letting one odd page abort the run.
        print('No image found on %s, skipping.' % tmpstr)
        continue

    # Keep only the part of the address before any query string.
    imgurl = tmpurl.split('?')[0]
    print(imgurl)
    urlpool.append(imgurl)
def down_img(url):
    """Download a single image into the ``save_path`` directory.

    The local file name is the last path segment of ``url``.

    Args:
        url: Absolute URL of the image to fetch.

    Returns:
        None. The file is written to disk and a confirmation line is printed.
    """
    # urlretrieve lives in urllib on Python 2 and urllib.request on Python 3.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    # Take the final segment instead of a fixed index (previously [7]) so the
    # name is right for any URL depth and short URLs no longer raise
    # IndexError.
    filename = url.rstrip('/').rsplit('/', 1)[-1]
    localpath = os.path.join(save_path, filename)
    urlretrieve(url, localpath)
    print('Download %s done!' % localpath)
# --- Download everything that was collected, using 4 worker processes --------
# NOTE(review): this runs at module level without an
# `if __name__ == '__main__':` guard.  Where multiprocessing starts workers by
# spawning a fresh interpreter instead of forking, the workers re-import this
# module -- confirm the target platform before relying on it.
if urlpool:
    pool = Pool(4)
    try:
        pool.map(down_img, urlpool)
    finally:
        # Always shut the workers down, even when a download raised.
        pool.close()
        pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment