Skip to content

Instantly share code, notes, and snippets.

@zjjott
Last active December 15, 2015 13:49
Show Gist options
  • Save zjjott/5270366 to your computer and use it in GitHub Desktop.
python爬取diameizi网页,然后下载图片
#coding=utf-8
# Crawl diameizi.diandian.com with wget's spider mode, then download every image.
import os
# Let wget walk the whole site tree without saving pages; stderr (which logs
# every URL it visits) is redirected into log.txt for parsing below.
os.system("wget -r --spider http://diameizi.diandian.com 2>|log.txt")# very simple way to enumerate the whole page tree -- essentially a shortcut
filein=open('log.txt','r')# wget's spider log; NOTE(review): never closed later
fileout=open('dst','w+')# scratch file that ends up holding the final URL set
filelist=list(filein)# all log lines, materialized for the scan below
import urllib2,time
from bs4 import BeautifulSoup
# Browser-like User-Agent so the site does not reject urllib2's default one.
header={
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1'}
def getsite(url):
    """Fetch *url* and return the raw response body as a byte string.

    Sends the module-level ``header`` dict (browser-like User-Agent) so the
    site does not reject urllib2's default agent string.
    """
    req = urllib2.Request(url, None, header)
    site = urllib2.urlopen(req)
    try:
        return site.read()
    finally:
        site.close()  # fix: original never closed the response (handle leak)
try:
dst=set()
for p in filelist:
if p.find('http://diameizi.diandian.com/post')>-1:
p=p[p.find('http'):]
dst.add(p)
i=0
for p in dst:
#if i<191:
# i+=1
# continue##断点续传部分
pagesoup=BeautifulSoup(getsite(p))
pageimg=pagesoup.find_all('img')
for href in pageimg:
print i,href['src']
picpath="pic/"+href['src'][-55:-13]+href['src'][-4:]##名字的起法有问题。。。不过效果还行。。
pic=getsite(href['src'])
picfile=open(picpath,'wb')
picfile.write(pic)
i+=1
picfile.close()
finally:
for p in dst:
fileout.write(p)
fileout.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment