@syshack
Created September 13, 2012 03:45
Multi-process crawler
#!/usr/bin/python
#-*- coding:utf-8 -*-
import urllib2
import urllib
import cookielib
import sys
import re
from multiprocessing import Process, Queue
#----------------------------------------------------------------------
def login():
    """Log in and install a cookie-aware opener shared by later requests."""
    # login URL
    login_url = 'http://www.sh.lahr.cn/index.php?action=logincenter&logintype=login'
    # keep the session cookies for every subsequent urlopen call
    cookies = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    opener = urllib2.build_opener(cookies)
    urllib2.install_opener(opener)
    # build headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.60 Safari/537.1',
               'Referer': 'http://www.sh.lahr.cn/index.php?action=logincenter'}
    postData = {'username': '****',
                'password': '****',
                'Userkind': '2',
                'submit': '登 录'}
    # encode the post data
    postData = urllib.urlencode(postData)
    # log in
    request = urllib2.Request(login_url, postData, headers)
    response = urllib2.urlopen(request)
#----------------------------------------------------------------------
def get_cv(url):
    """Download one resume and save it under cv/ with the server-supplied filename."""
    cv = urllib2.urlopen(url)
    cv_c = cv.read()
    header = cv.headers["Content-Disposition"]
    # the filename follows the "=" in the Content-Disposition header
    filename = header.split("=")[1].strip('"')
    fp = open("cv/" + filename, "wb")
    fp.write(cv_c)
    fp.close()
#----------------------------------------------------------------------
def make_url(page):
    """Fetch one listing page and build the resume download URLs it links to."""
    urls = []
    try:
        print "GET\t" + page
        result = urllib2.urlopen(page).read()
        reg_url = re.compile(r'<a href="(.*?)" target="_blank" class="per">')
        urls += reg_url.findall(result)
    except Exception:
        print "error"
    base_url = "http://www.sh.lahr.cn/"
    cv_urls = []
    for i in urls:
        cv_url = base_url + i.replace("resume", "createresume")
        cv_urls.append(cv_url)
    return cv_urls
#----------------------------------------------------------------------
def url_maker(input_que, output_que):
    """URL producer process."""
    # take page URLs from the pages queue until the 'OK' sentinel arrives,
    # calling make_url to collect the resume URLs (cv_urls) on each page
    for page in iter(input_que.get, 'OK'):
        cv_urls = make_url(page)
        print cv_urls
        # push every resume URL onto the output queue
        for i in cv_urls:
            output_que.put(i)
        print output_que.qsize()
#-----------------------------------------------------------------------
def worker(input):
    """Worker process."""
    # take resume URLs from the queue and download them until the 'STOP' sentinel arrives
    for url in iter(input.get, 'STOP'):
        get_cv(url)
#----------------------------------------------------------------------
#----------------------------------------------------------------------
def search():
    """Run the search and return a queue of resume URLs to download."""
    # queue of first-level listing pages
    pages = Queue()
    urls = Queue()
    search_url = 'http://www.sh.lahr.cn/index.php?action=newpersonal&extent=schbox&textboxplace=4&textboxjob=26&textboxfindpro=4&Jobyear=2&TextBoxedu=1'
    result = urllib2.urlopen(search_url).read()
    # read the total number of result pages
    reg_max = re.compile(r'第<span class="f70 b">1/(.*?)</span> 页')
    maxpage = int(reg_max.findall(result)[0])
    print maxpage
    # build the first-level page URLs (only the first 4 pages are queued here)
    for i in xrange(1, 5):
        # add each page URL to the queue
        pages.put(str(search_url) + "&page=" + str(i))
    # print the length of the pages queue
    print int(pages.qsize())
    # create the producer processes
    for i in range(NUMBER_OF_PROCESSES):
        p1 = Process(target=url_maker, args=(pages, urls))
        p1.start()
    # tell the producer processes to stop
    for i in xrange(NUMBER_OF_PROCESSES):
        pages.put('OK')
    return urls
def download(task_que):
    """Spawn worker processes and wait until every queued resume is downloaded."""
    # start the worker processes
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = Process(target=worker, args=(task_que,))
        p.start()
        workers.append(p)
    # tell the child processes to stop
    for i in xrange(NUMBER_OF_PROCESSES):
        task_que.put('STOP')
    # join only after the sentinels are queued, otherwise the first worker never returns
    for p in workers:
        p.join()
if __name__ == "__main__":
    login()
    NUMBER_OF_PROCESSES = 10
    # call search to build the queue of resume URLs
    task_que = search()
    # print the number of resumes waiting to be downloaded
    print task_que.qsize()
    download(task_que)
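For reference, a minimal self-contained sketch of the producer/consumer pattern the script relies on: worker processes loop with iter(queue.get, sentinel) and exit once each receives one sentinel. The names demo_worker and demo_que are illustrative only and not part of the gist.

#!/usr/bin/python
from multiprocessing import Process, Queue

def demo_worker(que):
    # keep pulling items until the 'STOP' sentinel arrives
    for item in iter(que.get, 'STOP'):
        print "got", item

if __name__ == "__main__":
    demo_que = Queue()
    workers = [Process(target=demo_worker, args=(demo_que,)) for _ in range(2)]
    for p in workers:
        p.start()
    for item in ["a", "b", "c"]:
        demo_que.put(item)
    # one sentinel per worker so every process exits its loop
    for _ in workers:
        demo_que.put('STOP')
    # join only after the sentinels are queued
    for p in workers:
        p.join()

This mirrors the 'OK' sentinels fed to url_maker and the 'STOP' sentinels fed to worker above.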