mrluanma · June 15, 2011 17:13
diff --git a/51job_hr.py b/51job_hr.py
 import os
 import errno
 import urllib
 import re
 from urlparse import urlparse
 from hashlib import md5
 import httplib2
 from lxml import html


 # Listing page that the script pages through; the query string
 # (BigClassID / SmallClassID / viewnum / page) is appended per request.
 URL = 'http://hrclub.51job.com/hrtool/Tools.asp'
 # Prefix prepended to the href of every download link found in the listing.
 DOWNLOAD_URL = 'http://hrclub.51job.com'


 def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error.  Any other failure is
    re-raised as ``OSError`` -- including the case where *path* exists but
    is not a directory, which would otherwise only surface later when a
    file is written below it.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # "except ... as" needs Python >= 2.6
        # EEXIST alone is not enough: it is also reported when a regular
        # file occupies the path.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


 # Shared HTTP client; httplib2 keeps its on-disk cache in ./.cache.
 h = httplib2.Http('.cache')

 # Browser-like headers sent with every request.
 # NOTE(review): the Cookie is a hard-coded ASP session id, presumably copied
 # from a browser session -- confirm it is still required and valid.
 headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "UTF-8,*;q=0.5",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "hrclub.51job.com",
    "Cookie": "ASPSESSIONIDACACCCQC=IHLNBDCAOPKNAIGHLDDABJFB",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.91 Safari/534.30",
 }

 # Walk the listing pages.  The loop ends when the pager markup of a page
 # hashes to the same digest as the previous page's (presumably the site
 # repeats the last page for out-of-range page numbers -- TODO confirm).
 last_digest = ''
 for page in range(1, 10000):
    params = {
        "BigClassID": 20,
        "SmallClassID": 0,
        "viewnum": 100,
        "page": page,
    }

    url = URL + "?" + urllib.urlencode(params)
    resp, content = h.request(url, headers=headers)

    if resp['status'] != '200':
        # Best effort: report the failing listing URL, then still try to
        # parse whatever body came back.
        print(url)

    doc = html.fromstring(content.decode('gb18030'))

    pager = doc.xpath('//div[@class="page"]')
    if not pager:
        # Without a pager block there is nothing to fingerprint; stop here
        # instead of failing with an IndexError.
        break
    digest = md5(html.tostring(pager[0])).hexdigest()

    if digest == last_digest:
        break

    last_digest = digest

    print("%s %s" % (page, digest))

    # Every table row after the first is expected to hold three cells:
    # title, category, download links.
    for row in doc.xpath('//table/tr[position()>1]'):
        children = row.xpath('td')

        if len(children) != 3:
            # Unexpected layout: dump the row for inspection and skip it.
            print(html.tostring(row))
            continue

        _title_element, cat_element, url_element = children
        # The category text doubles as the local target directory.
        cat = cat_element.text
        mkdir_p(cat)

        for href in (e.get('href') for e in url_element.xpath('a')):
            path = cat + '/' + os.path.basename(urlparse(href).path)
            resp, content = h.request(DOWNLOAD_URL + urllib.quote(href.encode('utf-8')), headers=headers)
            if resp['status'] == '200':
                print("[%s] %s" % (resp['status'], DOWNLOAD_URL + href))
                # Close the file deterministically so the data is flushed.
                with open(path, 'wb') as out:
                    out.write(content)
            elif resp['status'] != '304':
                # A 304 is skipped silently; anything else is reported.
                print("[%s] %s" % (resp['status'], DOWNLOAD_URL + href))
	import os
	import errno
	import urllib
	import re
	from urlparse import urlparse
	from hashlib import md5
	import httplib2
	from lxml import html


	# Listing page that the script pages through; the query string
	# (BigClassID / SmallClassID / viewnum / page) is appended per request.
	URL = 'http://hrclub.51job.com/hrtool/Tools.asp'
	# Prefix prepended to the href of every download link found in the listing.
	DOWNLOAD_URL = 'http://hrclub.51job.com'


	def mkdir_p(path):
	    """Create directory *path* and any missing parents, like ``mkdir -p``.

	    An already existing directory is not an error.  Any other failure is
	    re-raised as ``OSError`` -- including the case where *path* exists but
	    is not a directory, which would otherwise only surface later when a
	    file is written below it.
	    """
	    try:
	        os.makedirs(path)
	    except OSError as exc:  # "except ... as" needs Python >= 2.6
	        # EEXIST alone is not enough: it is also reported when a regular
	        # file occupies the path.
	        if exc.errno != errno.EEXIST or not os.path.isdir(path):
	            raise


	# Shared HTTP client; httplib2 keeps its on-disk cache in ./.cache.
	h = httplib2.Http('.cache')

	# Browser-like headers sent with every request.
	# NOTE(review): the Cookie is a hard-coded ASP session id, presumably copied
	# from a browser session -- confirm it is still required and valid.
	headers = {
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	    "Accept-Charset": "UTF-8,*;q=0.5",
	    "Accept-Encoding": "gzip,deflate,sdch",
	    "Accept-Language": "zh-CN,zh;q=0.8",
	    "Cache-Control": "max-age=0",
	    "Connection": "keep-alive",
	    "Host": "hrclub.51job.com",
	    "Cookie": "ASPSESSIONIDACACCCQC=IHLNBDCAOPKNAIGHLDDABJFB",
	    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.91 Safari/534.30",
	}

	# Walk the listing pages.  The loop ends when the pager markup of a page
	# hashes to the same digest as the previous page's (presumably the site
	# repeats the last page for out-of-range page numbers -- TODO confirm).
	last_digest = ''
	for page in range(1, 10000):
	    params = {
	        "BigClassID": 20,
	        "SmallClassID": 0,
	        "viewnum": 100,
	        "page": page,
	    }

	    url = URL + "?" + urllib.urlencode(params)
	    resp, content = h.request(url, headers=headers)

	    if resp['status'] != '200':
	        # Best effort: report the failing listing URL, then still try to
	        # parse whatever body came back.
	        print(url)

	    doc = html.fromstring(content.decode('gb18030'))

	    pager = doc.xpath('//div[@class="page"]')
	    if not pager:
	        # Without a pager block there is nothing to fingerprint; stop here
	        # instead of failing with an IndexError.
	        break
	    digest = md5(html.tostring(pager[0])).hexdigest()

	    if digest == last_digest:
	        break

	    last_digest = digest

	    print("%s %s" % (page, digest))

	    # Every table row after the first is expected to hold three cells:
	    # title, category, download links.
	    for row in doc.xpath('//table/tr[position()>1]'):
	        children = row.xpath('td')

	        if len(children) != 3:
	            # Unexpected layout: dump the row for inspection and skip it.
	            print(html.tostring(row))
	            continue

	        _title_element, cat_element, url_element = children
	        # The category text doubles as the local target directory.
	        cat = cat_element.text
	        mkdir_p(cat)

	        for href in (e.get('href') for e in url_element.xpath('a')):
	            path = cat + '/' + os.path.basename(urlparse(href).path)
	            resp, content = h.request(DOWNLOAD_URL + urllib.quote(href.encode('utf-8')), headers=headers)
	            if resp['status'] == '200':
	                print("[%s] %s" % (resp['status'], DOWNLOAD_URL + href))
	                # Close the file deterministically so the data is flushed.
	                with open(path, 'wb') as out:
	                    out.write(content)
	            elif resp['status'] != '304':
	                # A 304 is skipped silently; anything else is reported.
	                print("[%s] %s" % (resp['status'], DOWNLOAD_URL + href))