Skip to content

Instantly share code, notes, and snippets.

@syshack
Created September 12, 2012 12:36
Show Gist options
  • Select an option

  • Save syshack/3706331 to your computer and use it in GitHub Desktop.

Select an option

Save syshack/3706331 to your computer and use it in GitHub Desktop.
lahr简历爬虫 (lahr résumé crawler)
#!/usr/bin/python
#-*- coding:utf-8 -*-
import urllib2
import urllib
import cookielib
import sys
import re
from multiprocessing import Process, Queue
#----------------------------------------------------------------------
def login():
    """Log in to sh.lahr.cn and install a cookie-aware global opener.

    After this call every subsequent urllib2.urlopen() in the process
    carries the session cookie obtained here, so the scraping functions
    below run as an authenticated user.
    """
    login_url = 'http://www.sh.lahr.cn/index.php?action=logincenter&logintype=login'
    # Cookie-handling opener, installed globally so later requests stay
    # logged in.
    cookie_handler = urllib2.HTTPCookieProcessor()
    urllib2.install_opener(urllib2.build_opener(cookie_handler))
    # Spoof a desktop browser and supply the login page as referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.1 (KHTML, like Gecko) '
                      'Chrome/21.0.1180.60 Safari/537.1',
        'Referer': 'http://www.sh.lahr.cn/index.php?action=logincenter',
    }
    # NOTE(review): the credentials are redacted placeholders in the
    # published gist — fill in real values before running.
    form_data = urllib.urlencode({
        'username': '×××',
        'password': '××××',
        'Userkind': '2',
        'submit': '登 录',
    })
    # POST the login form; the response body itself is not needed, only
    # the session cookie captured by the opener.
    urllib2.urlopen(urllib2.Request(login_url, form_data, headers))
#----------------------------------------------------------------------
def get_cv(url):
    """Download one resume from *url* and save it under cv/.

    The filename is taken from the server's Content-Disposition header
    (assumed to look like 'attachment; filename=xxx.doc' — the text after
    the first '=' is used verbatim).
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
        header = response.headers["Content-Disposition"]
        filename = header.split("=")[1]
    finally:
        response.close()
    # Binary mode: resumes are served as file downloads (e.g. .doc);
    # text mode would corrupt them on Windows.
    fp = open("cv/" + filename, "wb")
    try:
        fp.write(content)
    finally:
        # Bug fix: the original had `fp.close` (no parentheses), so the
        # file handle was never actually closed.
        fp.close()
#----------------------------------------------------------------------
def make_url():
    """Run the fixed search query and return the resume-download URLs.

    First reads the total page count from the pager text ('第1/N页'),
    then collects the per-resume links from every result page and
    rewrites each one into its download ('createresume') form.
    """
    search_url = ('http://www.sh.lahr.cn/index.php?action=newpersonal'
                  '&extent=schbox&textboxplace=4&textboxjob=26'
                  '&textboxfindpro=4&Jobyear=2&TextBoxedu=1')
    result = urllib2.urlopen(search_url).read()
    reg_max = re.compile(r'第<span class="f70 b">1/(.*?)</span> 页')
    maxpage = int(reg_max.findall(result)[0])
    # The pager text is 1-based ("page 1/N"), so request pages
    # 1..maxpage.  The original iterated xrange(maxpage) (0..maxpage-1),
    # which skipped the final page.  (Assumes &page= is 1-based on this
    # site — TODO confirm against the live pager links.)
    pages = [search_url + "&page=" + str(i) for i in xrange(1, maxpage + 1)]
    # Compile the link pattern once, outside the per-page loop.
    reg_url = re.compile(r'<a href="(.*?)" target="_blank" class="per">')
    urls = []
    for page in pages:
        urls += reg_url.findall(urllib2.urlopen(page).read())
    base_url = "http://www.sh.lahr.cn/"
    # The 'resume' view page becomes the 'createresume' download page.
    return [base_url + u.replace("resume", "createresume") for u in urls]
#-----------------------------------------------------------------------
def worker(input):
    """Consume resume URLs from the *input* queue until a 'STOP' sentinel.

    Runs inside a child process; each URL is handed to get_cv() for
    download.
    """
    while True:
        url = input.get()
        if url == 'STOP':
            break
        get_cv(url)
#----------------------------------------------------------------------
def main():
    """Fan the collected resume URLs out to a pool of worker processes.

    Reads the module-level globals `cv_urls` (set by make_url()) and
    `NUMBER_OF_PROCESSES`, which the __main__ block initialises before
    calling this function.
    """
    task_queue = Queue()
    # Submit all download tasks up front.
    for url in cv_urls:
        task_queue.put(url)
    # Start the worker pool.
    workers = []
    for _ in range(NUMBER_OF_PROCESSES):
        p = Process(target=worker, args=(task_queue,))
        p.start()
        workers.append(p)
    # One STOP sentinel per worker so every process exits its loop.
    for _ in range(NUMBER_OF_PROCESSES):
        task_queue.put('STOP')
    # Bug fix: the original never joined its children, so the parent
    # could finish (and the interpreter begin teardown) while downloads
    # were still in flight.
    for p in workers:
        p.join()
if __name__ == "__main__":
    # Authenticate first so the scraping requests carry a session cookie.
    login()
    # Collect every resume-download URL from the paginated search.
    cv_urls = make_url()
    # Pool size for the download workers (read as a global by main()).
    NUMBER_OF_PROCESSES = 10
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment