Last active
December 15, 2015 13:49
-
-
Save zjjott/5270366 to your computer and use it in GitHub Desktop.
Python script that crawls the diameizi site pages and downloads their images (python爬取diameizi网页,然后下载图片)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8
import os
# Spider the entire site with wget (--spider: walk links without saving
# pages) and redirect its log to log.txt -- a very simple way to enumerate
# the whole page tree; essentially a lazy shortcut.
os.system("wget -r --spider http://diameizi.diandian.com 2>|log.txt")
filein=open('log.txt','r')
fileout=open('dst','w+')# throwaway file that receives the final URL set
filelist=list(filein)
import urllib2,time
from bs4 import BeautifulSoup
# Browser-like User-Agent so the server does not reject script traffic.
header={
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1'}
def getsite(url):
    """Fetch *url* with the browser-like ``header`` and return the raw body.

    Fix: the original never closed the urlopen response, leaking one socket
    per fetched page/image; close it in ``finally`` after reading.
    """
    req=urllib2.Request(url,None,header)
    site=urllib2.urlopen(req)
    try:
        return site.read()
    finally:
        site.close()
try:
    # Unique post URLs harvested from wget's crawl log.
    dst=set()
    for p in filelist:
        if p.find('http://diameizi.diandian.com/post')>-1:
            # Keep everything from 'http' onward (drops wget's log prefix;
            # the trailing newline from the log line is kept).
            p=p[p.find('http'):]
            dst.add(p)
    i=0
    for p in dst:
        #if i<191:
        #    i+=1
        #    continue## crude resume support: skip already-downloaded pages
        pagesoup=BeautifulSoup(getsite(p))
        pageimg=pagesoup.find_all('img')
        for href in pageimg:
            print i,href['src']
            # File name derived by slicing the image URL; the naming scheme
            # is dubious but works well enough in practice.
            picpath="pic/"+href['src'][-55:-13]+href['src'][-4:]
            pic=getsite(href['src'])
            picfile=open(picpath,'wb')
            picfile.write(pic)
            i+=1
            picfile.close()
finally:
    # Always dump the collected URL set, even if the download loop died.
    for p in dst:
        fileout.write(p)
    fileout.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment