Created
April 30, 2015 11:15
-
-
Save dodola/d35b0ad5523e402d2985 to your computer and use it in GitHub Desktop.
Topit.me网站微型爬虫 — a tiny crawler for the topit.me image-sharing site.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: utf-8 | |
__author__ = 'Dodola' | |
from lxml.html import parse | |
from time import sleep,ctime | |
import time | |
import urllib2 | |
import threading | |
import contextlib | |
import os | |
import urllib | |
# topit.me URL templates; the "?p=N" query selects a pagination page.
BASEURL = "http://www.topit.me"
ALBUMURL="http://www.topit.me/album/"      # album index page: ALBUMURL + album id
ALBUMPERURL="http://www.topit.me/album/%s?p=%s"  # (album id, page number)
USERURL="http://www.topit.me/user/"        # user profile page: USERURL + user id
USERPERURL="http://www.topit.me/user/%s?p=%s"    # (user id, page number)
class AppURLopener(urllib.FancyURLopener):
    """URL opener with a browser-like User-Agent string.

    Overriding ``version`` on Python 2's ``urllib.FancyURLopener`` changes
    the User-Agent header sent with every request — presumably because
    topit.me blocks the default ``Python-urllib`` agent (TODO confirm).
    """
    version = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
def Download(path, pageUrl):
    """Download the full-size image linked from one topit.me item page.

    path: destination directory (expected to end with '/'); the file name
          is appended directly.
    pageUrl: URL of an item page containing an <a id="item-tip"> anchor
             whose href is the actual image URL.
    Raises IndexError when the page has no 'item-tip' anchor, plus
    whatever lxml / urllib raise on network or filesystem errors.
    """
    spath = parse(pageUrl)
    imageUrls = spath.xpath('//a[@id="item-tip"]')
    imageUrl = imageUrls[0].attrib["href"]
    print("正在下载%s\n"%imageUrl)
    # Local file name is the last path segment of the image URL.
    imageName = imageUrl.rsplit('/', 1)[-1]
    # Custom opener spoofs a browser User-Agent (see AppURLopener).
    urlopener = AppURLopener()
    urlopener.retrieve(imageUrl, path+imageName)
    print("保存成功:%s%s\n"%(path ,imageName))
def DownloadUser(path,userid):
    """Download every image posted by a topit.me user, page by page.

    path: base directory; a timestamped sub-directory is created inside it
          so separate runs never collide.
    userid: numeric user id, as a string.
    """
    pageUrl=USERURL+userid
    tempdir="%s%s/"%(path,time.strftime("%Y%m%d%H%M%S",time.localtime(time.time())))
    print(tempdir)
    os.mkdir(tempdir)
    print(pageUrl)
    spath=parse(pageUrl)
    # Pagination anchors: the second-to-last one carries the last page number.
    pagecounts=spath.xpath("id('pagination')/div/a")
    print(pagecounts)
    if len(pagecounts)>1:
        pagecount=int(pagecounts[len(pagecounts)-2].text_content())
        # Only printed once we know the anchor exists (the original crashed
        # here with IndexError on single-page profiles).
        print("页面总数:%s"%pagecount)
    else:
        # No pagination block at all: the profile has a single page.
        pagecount=1
    # range upper bound is pagecount+1 so the LAST page is included
    # (the original's range(1, pagecount) silently skipped it).
    for page in range(1,pagecount+1):
        print("访问第%s页"%page)
        DownloadPerUser(tempdir,userid,page)
def DownloadPerUser(path,userid,page):
    """Download all images on one page of a user's profile.

    path: destination directory for the image files.
    userid: numeric user id, as a string.
    page: 1-based page number within the profile.

    Downloads run strictly one at a time: each thread is join()ed right
    after start(), because too many concurrent connections get the
    crawler banned by topit.me.
    """
    pageUrl=USERPERURL% (userid,page)
    pageel=parse(pageUrl)
    imgUrls=pageel.xpath("//a[starts-with(@href,'http://www.topit.me/item/')]/@href")
    imgUrls=set(imgUrls)  # de-duplicate: each item is linked more than once
    print("第%s页图片数%s"%(page,len(imgUrls)))
    for itemUrl in imgUrls:
        # start() immediately followed by join() keeps execution serial.
        t = threading.Thread(target=Download,args=(path,itemUrl))
        t.start()
        t.join()
    print("线程结束")
def DownloadAlbum(path,albumId,page=1):
    """Download a whole topit.me album, one worker thread per page.

    path: base directory; a timestamped sub-directory is created inside it
          so separate runs never collide.
    albumId: numeric album id, as a string.
    page: first page to fetch (default 1, i.e. the whole album).
    """
    pageUrl=ALBUMURL+albumId
    tempdir="%s%s/"%(path,time.strftime("%Y%m%d%H%M%S",time.localtime(time.time())))
    print(tempdir)
    os.mkdir(tempdir)
    print(pageUrl)
    spath=parse(pageUrl)
    # Pagination anchors: the second-to-last one carries the last page number.
    pagecounts=spath.xpath("id('pagination')/div/a")
    print(pagecounts)
    if len(pagecounts)>1:
        pagecount=int(pagecounts[len(pagecounts)-2].text_content())
        # Only printed once we know the anchor exists (the original crashed
        # here with IndexError on single-page albums).
        print("页面总数:%s"%pagecount)
    else:
        # Single-page album: the original's fallback of 2 requested a
        # nonexistent second page.
        pagecount=1
    task_threads=[]  # one worker thread per album page
    for page in range(page,pagecount+1):
        print("访问第%s页"%page)
        t= threading.Thread(target=DownloadPerAlbum,args=(tempdir,albumId,page))
        task_threads.append(t)
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()  # wait for every page worker to finish
def DownloadPerAlbum(path,albumid,page):
    """Download all images on one page of an album, sequentially.

    path: destination directory for the image files.
    albumid: numeric album id, as a string.
    page: 1-based page number within the album.
    """
    pageUrl=ALBUMPERURL% (albumid,page)
    pageel=parse(pageUrl)
    imgUrls=pageel.xpath("//a[starts-with(@href,'http://www.topit.me/album/%s/item/')]/@href"%albumid)
    imgUrls=set(imgUrls)  # de-duplicate: each item is linked more than once
    print("第%s页图片数%s"%(page,len(imgUrls)))
    # Deliberately sequential: concurrent downloads get the crawler banned.
    for itemUrl in imgUrls:
        Download(path,itemUrl)
def testMe(args):
    """Debug helper: echo *args* to stdout, then pause for one second."""
    print(args)
    # `sleep` is already imported at file top via `from time import sleep,ctime`.
    sleep(1)
if __name__ == "__main__":
    # Crawl only when executed as a script — importing this module should
    # not start downloading.
    DownloadAlbum("/Users/dodola/Documents/TopitMe/","1131077")
    # DownloadUser("/Users/dodola/Documents/TopitMe/","3271946")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment