Skip to content

Instantly share code, notes, and snippets.

@mrluanma
Created June 15, 2011 17:13
Show Gist options
  • Save mrluanma/1027572 to your computer and use it in GitHub Desktop.
Save mrluanma/1027572 to your computer and use it in GitHub Desktop.
import os
import errno
import urllib
import re
from urlparse import urlparse
from hashlib import md5
import httplib2
from lxml import html
# Root of the site; the relative download links scraped from the listing are
# appended to this.
DOWNLOAD_URL = 'http://hrclub.51job.com'
# Listing endpoint that the crawl queries once per results page.
URL = DOWNLOAD_URL + '/hrtool/Tools.asp'
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already-existing directory is not an error. Every other failure is
    re-raised as the original ``OSError``, including the case where *path*
    already exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # EEXIST is also reported when a regular file occupies the path, so
        # only swallow it when the target really is a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
# Single HTTP client reused for every request; httplib2 keeps its response
# cache in the '.cache' directory (relative to the working directory).
h = httplib2.Http(cache='.cache')
# Browser-like request headers sent with every listing and download request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "UTF-8,*;q=0.5",
    # Only advertise encodings httplib2 can decode itself; "sdch" was copied
    # from a Chrome request and the client has no decoder for it.
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "hrclub.51job.com",
    # NOTE(review): hard-coded ASP session id -- presumably captured from a
    # browser session and likely to expire; confirm whether the site needs it.
    "Cookie": "ASPSESSIONIDACACCCQC=IHLNBDCAOPKNAIGHLDDABJFB",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.91 Safari/534.30",
}
# Walk the paginated listing and mirror every linked file into a directory
# named after the row's category cell.
#
# The crawl stops when the pager block of a page hashes to the same value as
# the previous page's pager (presumably the site keeps serving the last page
# once the page number runs past the end -- confirm against the live site).
last_digest = ''
for page in range(1, 10000):
    params = {
        "BigClassID": 20,
        "SmallClassID": 0,
        "viewnum": 100,
        "page": page,
    }
    url = URL + "?" + urllib.urlencode(params)
    resp, content = h.request(url, headers=headers)
    if resp['status'] != '200':
        # An error body is not a listing page: report the URL and skip it
        # instead of trying to parse it.
        print(url)
        continue

    # The listing is decoded as GB18030 before being handed to lxml.
    doc = html.fromstring(content.decode('gb18030'))
    pagers = doc.xpath('//div[@class="page"]')
    if not pagers:
        # Without the pager block the end-of-listing check cannot work, so
        # report the page and stop rather than fail with an IndexError.
        print(url)
        break
    digest = md5(html.tostring(pagers[0])).hexdigest()
    if digest == last_digest:
        break
    last_digest = digest
    print("%s %s" % (page, digest))

    # The first table row is skipped by the XPath (position()>1).
    for row in doc.xpath('//table/tr[position()>1]'):
        cells = row.xpath('td')
        if len(cells) != 3:
            # Unexpected row layout: dump it for inspection and move on.
            print(html.tostring(row))
            continue
        cat_element, url_element = cells[1], cells[2]
        cat = cat_element.text
        if not cat:
            # No category text means there is no directory to save into.
            print(html.tostring(row))
            continue
        mkdir_p(cat)

        # 'a/@href' yields only anchors that actually carry an href.
        for href in url_element.xpath('a/@href'):
            filename = os.path.basename(urlparse(href).path)
            if not filename:
                # A link whose path ends in '/' has no file name to write to.
                continue
            path = cat + '/' + filename
            file_resp, file_content = h.request(
                DOWNLOAD_URL + urllib.quote(href.encode('utf-8')),
                headers=headers)
            status = file_resp['status']
            # Everything except 304 (not modified) is reported.
            if status != '304':
                print("[%s] %s" % (status, DOWNLOAD_URL + href))
            if status == '200':
                with open(path, 'wb') as out:
                    out.write(file_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment