Created
September 12, 2012 12:36
-
-
Save syshack/3706331 to your computer and use it in GitHub Desktop.
lahr resume crawler (lahr 简历爬虫)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| #-*- coding:utf-8 -*- | |
| import urllib2 | |
| import urllib | |
| import cookielib | |
| import sys | |
| import re | |
| from multiprocessing import Process, Queue | |
| #---------------------------------------------------------------------- | |
def login():
    """Log in to sh.lahr.cn so later requests reuse the session cookie.

    Installs a cookie-aware opener process-wide via
    urllib2.install_opener, then POSTs the login form once; every
    subsequent urllib2.urlopen call in this process is authenticated.
    """
    login_url = 'http://www.sh.lahr.cn/index.php?action=logincenter&logintype=login'
    # Cookie-handling opener, installed globally for the whole process.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    urllib2.install_opener(opener)
    # Browser-like headers; the site expects a matching Referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.60 Safari/537.1',
        'Referer': 'http://www.sh.lahr.cn/index.php?action=logincenter',
    }
    # Login form fields (credentials are redacted placeholders in this gist).
    form_data = urllib.urlencode({
        'username': '×××',
        'password': '××××',
        'Userkind': '2',
        'submit': '登 录',
    })
    # Perform the login POST; we only need the cookie side effect.
    urllib2.urlopen(urllib2.Request(login_url, form_data, headers))
| #获取数据 | |
| #---------------------------------------------------------------------- | |
def get_cv(url):
    """Download one CV from *url* and save it under the cv/ directory.

    The server names the file via the Content-Disposition response
    header (``attachment; filename=...``); everything after the first
    '=' is taken as the file name.
    """
    cv = urllib2.urlopen(url)
    try:
        cv_c = cv.read()
        header = cv.headers["Content-Disposition"]
        # Strip whitespace and optional quotes around the file name.
        filename = header.split("=")[1].strip().strip('"')
        # Fix: open in binary mode (CVs are binary documents) and close
        # deterministically — the original `fp.close` was missing the
        # call parentheses, so the handle was never closed.
        fp = open("cv/" + filename, "wb")
        try:
            fp.write(cv_c)
        finally:
            fp.close()
    finally:
        cv.close()
| #---------------------------------------------------------------------- | |
def make_url():
    """Collect the CV download URL for every hit of the hard-coded search.

    Returns:
        list of absolute 'createresume' URLs, one per search result.
    """
    search_url = 'http://www.sh.lahr.cn/index.php?action=newpersonal&extent=schbox&textboxplace=4&textboxjob=26&textboxfindpro=4&Jobyear=2&TextBoxedu=1'
    result = urllib2.urlopen(search_url).read()
    # The result page reports pagination as 第1/N页; capture N.
    reg_max = re.compile(r'第<span class="f70 b">1/(.*?)</span> 页')
    maxpage = int(reg_max.findall(result)[0])
    # Fix: pages are 1-based — the original xrange(maxpage) requested
    # page 0 (a duplicate of page 1) and never fetched the last page.
    pages = [search_url + "&page=" + str(i) for i in xrange(1, maxpage + 1)]
    # Hoisted out of the loop: compile the result-link pattern once.
    reg_url = re.compile(r'<a href="(.*?)" target="_blank" class="per">')
    urls = []
    for page in pages:
        result = urllib2.urlopen(page).read()
        urls += reg_url.findall(result)
    base_url = "http://www.sh.lahr.cn/"
    # A profile link becomes a download link by swapping the action name.
    return [base_url + u.replace("resume", "createresume") for u in urls]
| #----------------------------------------------------------------------- | |
def worker(input):
    """Consume CV URLs from the *input* queue until the 'STOP' sentinel.

    Runs in a child process; each URL received is handed to get_cv
    for download.
    """
    while True:
        url = input.get()
        if url == 'STOP':
            break
        get_cv(url)
| #---------------------------------------------------------------------- | |
def main():
    """Fan the collected CV URLs out to a pool of downloader processes.

    Reads the module-level ``cv_urls`` list and ``NUMBER_OF_PROCESSES``
    constant set by the script entry point.
    """
    task_queue = Queue()
    # Submit one task per CV URL.
    for url in cv_urls:
        task_queue.put(url)
    # Start the worker processes, keeping handles so we can join them.
    workers = []
    for _ in range(NUMBER_OF_PROCESSES):
        p = Process(target=worker, args=(task_queue,))
        p.start()
        workers.append(p)
    # One sentinel per worker tells each child process to stop.
    for _ in range(NUMBER_OF_PROCESSES):
        task_queue.put('STOP')
    # Fix: wait for every worker to drain the queue — the original
    # returned immediately, while downloads were still in flight.
    for p in workers:
        p.join()
| if __name__ == "__main__": | |
| login() | |
| cv_urls = make_url() | |
| NUMBER_OF_PROCESSES = 10 | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment