-
-
Save binyuj/5270444 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8
# Script setup: imports, site crawl via wget, and the shared file handles /
# request headers used by the rest of the script.
import os
import time

try:
    import urllib2
except ImportError:
    # Python 3: urllib.request provides the Request/urlopen API used below,
    # so alias it under the name the rest of the file expects.
    import urllib.request as urllib2

from bs4 import BeautifulSoup

# A very simple way to walk the whole site tree -- essentially a lazy
# shortcut: let wget crawl recursively in spider mode and redirect its
# stderr (overwriting any previous file via `>|`) into log.txt.
os.system("wget -r --spider http://diameizi.diandian.com 2>|log.txt")

# Read the whole crawl log into memory and close the handle right away;
# only the list of lines is needed afterwards.
with open('log.txt', 'r') as filein:
    filelist = list(filein)

# A not-very-useful file that holds the final result (the collected URLs).
# It stays open here; the main section of the script writes and closes it.
fileout = open('dst', 'w+')

# Request headers sent with every HTTP fetch (a desktop Firefox User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1'}
def getsite(url):
    """Fetch *url* and return the raw response body.

    The request is sent with the module-level ``header`` dict as its HTTP
    headers. Any exception raised by ``urllib2.urlopen`` or ``read()``
    propagates to the caller.
    """
    # These few lines are basically an all-purpose recipe for fetching a page.
    req = urllib2.Request(url, None, header)
    site = urllib2.urlopen(req)
    try:
        return site.read()
    finally:
        # Always release the connection; the crawl calls this once per page
        # and once per image, so leaked responses add up quickly.
        site.close()
# Main section: collect the post URLs found in the wget log, download every
# image referenced by each post into pic/, and finally dump the collected
# URLs into the `fileout` handle (one per line).
#
# `dst` is bound before the `try` so the `finally` clause can always use it.
dst = set()
try:
    for line in filelist:
        if 'http://diameizi.diandian.com/post' in line:
            # Keep everything from the first 'http' onwards and drop the
            # trailing newline of the log line, so the URL handed to the
            # HTTP layer contains no control characters.
            dst.add(line[line.find('http'):].strip())

    # Images are written below pic/; make sure the directory exists.
    if not os.path.isdir("pic"):
        os.makedirs("pic")

    i = 0  # running count of downloaded images, also shown in the progress line
    for page_url in dst:
        # Resume-after-interruption part (disabled by the original author):
        # if i<191:
        #     i+=1
        #     continue
        pagesoup = BeautifulSoup(getsite(page_url), "html.parser")
        # src=True skips <img> tags without a src attribute, which would
        # otherwise raise KeyError and abort the whole run.
        for img in pagesoup.find_all('img', src=True):
            src = img['src']
            print("%d %s" % (i, src))
            # The naming scheme is questionable... but works well enough:
            # a fixed slice from the tail of the URL plus its last 4 chars.
            picpath = "pic/" + src[-55:-13] + src[-4:]
            pic = getsite(src)
            with open(picpath, 'wb') as picfile:
                picfile.write(pic)
            i += 1
finally:
    # Always record the URLs gathered so far, even if the crawl failed midway.
    for page_url in dst:
        fileout.write(page_url + "\n")
    fileout.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment