Multiprocess resume crawler (多进程虫子): logs into sh.lahr.cn, scrapes the search-result pages for resume links, and downloads each resume with a pool of worker processes.
#!/usr/bin/python
#-*- coding:utf-8 -*-
import urllib2
import urllib
import cookielib
import sys
import re
from multiprocessing import Process, Queue
#----------------------------------------------------------------------
def login():
    """Log in so that later requests carry the session cookie."""
    #login URL
    login_url = 'http://www.sh.lahr.cn/index.php?action=logincenter&logintype=login'
    #install a cookie-aware opener; urllib2.install_opener makes every
    #later urllib2.urlopen call in this process reuse the session cookie
    cookies = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    opener = urllib2.build_opener(cookies)
    urllib2.install_opener(opener)
    #build headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.60 Safari/537.1',
               'Referer': 'http://www.sh.lahr.cn/index.php?action=logincenter'}
    postData = {'username': '****',
                'password': '****',
                'Userkind': '2',
                'submit': '登 录'
                }
    #encode the form data
    postData = urllib.urlencode(postData)
    #log in
    request = urllib2.Request(login_url, postData, headers)
    response = urllib2.urlopen(request)
#----------------------------------------------------------------------
def get_cv(url):
    """Download one resume and save it under cv/."""
    cv = urllib2.urlopen(url)
    cv_c = cv.read()
    #the server names the file via Content-Disposition, e.g.
    #'attachment; filename=xxx.doc'
    header = cv.headers["Content-Disposition"]
    filename = header.split("=")[1].strip('"')
    fp = open("cv/" + filename, "wb")  #binary mode: resumes are not plain text
    fp.write(cv_c)
    fp.close()
#----------------------------------------------------------------------
def make_url(page):
    """Fetch one search-result page and return the resume URLs on it."""
    urls = []
    try:
        print "GET\t" + page
        result = urllib2.urlopen(page).read()
        reg_url = re.compile(r'<a href="(.*?)" target="_blank" class="per">')
        urls += reg_url.findall(result)
    except Exception, e:
        print "error fetching %s: %s" % (page, e)
    base_url = "http://www.sh.lahr.cn/"
    cv_urls = []
    for i in urls:
        #swap "resume" for "createresume" to get the downloadable version
        cv_url = base_url + i.replace("resume", "createresume")
        cv_urls.append(cv_url)
    return cv_urls
#----------------------------------------------------------------------
def url_maker(input_que, output_que):
    """Producer: expand search-result pages into resume URLs."""
    #pull page URLs from the input queue until the 'OK' sentinel,
    #and expand each page into resume URLs via make_url()
    for page in iter(input_que.get, 'OK'):
        cv_urls = make_url(page)
        print cv_urls
        #push every resume URL onto the output queue
        for i in cv_urls:
            output_que.put(i)
        print output_que.qsize()
#-----------------------------------------------------------------------
def worker(input):
    """Consumer: download resumes until the 'STOP' sentinel arrives."""
    for url in iter(input.get, 'STOP'):
        get_cv(url)
#----------------------------------------------------------------------
#----------------------------------------------------------------------
def search():
    """Run the search and return a queue of resume URLs to download."""
    #first-level queue holds search-result pages, second the resume URLs
    pages = Queue()
    urls = Queue()
    search_url = 'http://www.sh.lahr.cn/index.php?action=newpersonal&extent=schbox&textboxplace=4&textboxjob=26&textboxfindpro=4&Jobyear=2&TextBoxedu=1'
    result = urllib2.urlopen(search_url).read()
    #read the total page count from the "第1/N 页" pager markup
    reg_max = re.compile(r'第<span class="f70 b">1/(.*?)</span> 页')
    maxpage = int(reg_max.findall(result)[0])
    print maxpage
    #enqueue the paged search URLs (only the first 4 pages here;
    #use xrange(1, maxpage + 1) to crawl them all)
    for i in xrange(1, 5):
        pages.put(search_url + "&page=" + str(i))
    #report how many pages are queued
    print pages.qsize()
    #start the producer processes
    producers = []
    for i in range(NUMBER_OF_PROCESSES):
        p1 = Process(target=url_maker, args=(pages, urls))
        p1.start()
        producers.append(p1)
    #one 'OK' sentinel per producer tells it to stop
    for i in xrange(NUMBER_OF_PROCESSES):
        pages.put('OK')
    #wait for the producers so the returned queue is complete
    for p1 in producers:
        p1.join()
    return urls
def download(task_que):
    """Start the worker processes and wait for them to finish."""
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = Process(target=worker, args=(task_que,))
        p.start()
        workers.append(p)
    #one 'STOP' sentinel per worker; queue the sentinels before
    #joining, otherwise join() blocks on workers that never exit
    for i in xrange(NUMBER_OF_PROCESSES):
        task_que.put('STOP')
    for p in workers:
        p.join()
if __name__ == "__main__":
    login()
    NUMBER_OF_PROCESSES = 10
    #run the search to build the queue of resume URLs
    task_que = search()
    #report how many resumes are queued for download
    print task_que.qsize()
    download(task_que)
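
The whole pipeline leans on one multiprocessing idiom: iter(queue.get, sentinel) turns a blocking Queue into an iterator that ends once a worker reads the sentinel value ('OK' for the producers, 'STOP' for the downloaders). Below is a minimal, self-contained sketch of just that idiom, in the same Python 2 as the gist; echo_worker and the placeholder items are illustrative stand-ins, not part of the crawler.

#!/usr/bin/python
from multiprocessing import Process, Queue

def echo_worker(task_que, done_que):
    #consume until this worker reads the 'STOP' sentinel
    for item in iter(task_que.get, 'STOP'):
        done_que.put(item.upper())  #stand-in for real work

if __name__ == "__main__":
    task_que = Queue()
    done_que = Queue()
    workers = [Process(target=echo_worker, args=(task_que, done_que))
               for _ in range(4)]
    for p in workers:
        p.start()
    for item in ["a", "b", "c", "d", "e"]:
        task_que.put(item)
    #one sentinel per worker, queued *before* join()
    for _ in workers:
        task_que.put('STOP')
    for p in workers:
        p.join()
    while not done_que.empty():
        print done_que.get()

Queuing one sentinel per worker before calling join() is what keeps the joins from deadlocking; search() and download() above follow the same ordering.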