Created
June 7, 2014 02:35
-
-
Save zhwei/2b875196771db32fd9f2 to your computer and use it in GitHub Desktop.
抓取百姓网指定城市中二手车栏目下一百个帖子的详细内容
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Scrape the first 100 used-car ("ershouqiche") listings for a set of
# cities on baixing.com and save each listing's details to a text file.
import re
import sys
import time
import threading

# `requests` is the only third-party dependency; fail fast with an
# install hint instead of crashing later with an ImportError traceback.
try:
    import requests
except ImportError:
    print("Please Install Package `requests`, Note: `pip install requests`")
    sys.exit()
# Extra HTTP headers sent with every request (currently none).
headers = {}

# Fallback HTTP proxies tried when the site answers 503 (anti-scraping).
proxies = (
    "222.66.115.233:80",
)

# Pre-compiled patterns for extracting fields from a listing's mobile page.
title_par = re.compile('<div class="viewad"><h3>(.*?)<')   # listing title
info_par = re.compile('<li>(.*?)</li>')                    # misc <li> details
content_par = re.compile('<div class="description">(.*?)</div>', re.M|re.S)   # body text
tel_par = re.compile('<a id="contact-number" href="tel:(.*?)"><', re.M|re.S)  # phone number
# Compiled once at import time instead of on every striphtml() call.
_TAG_RE = re.compile(r'<.*?>')

def striphtml(data):
    """Return *data* with all HTML tags removed.

    Args:
        data: HTML source as a string.

    Returns:
        The input with every non-greedy ``<...>`` span stripped out.
    """
    return _TAG_RE.sub('', data)
def get_page_url(city, num=0):
    """Build the used-car category listing URL for *city*.

    A truthy *num* selects that results page via the ``?page=`` query
    string; ``num=0`` (the default) yields the bare category URL.
    """
    suffix = "?page=%s" % num if num else ""
    return "http://%s.baixing.com/m/ershouqiche/" % city + suffix
def safe_get(url, cookies, headers=headers, **kwargs):
    """GET *url*, working around the site's anti-scraping 503 responses.

    On a 503 the request is retried once through each proxy in the
    module-level ``proxies`` tuple; if every proxy also returns 503 the
    whole script aborts, since no further progress is possible.

    Args:
        url: Absolute URL to fetch.
        cookies: Cookie jar to send (obtained from the city homepage).
        headers: Extra HTTP headers; defaults to the module-level dict.
        **kwargs: Passed straight through to ``requests.get``.

    Returns:
        The ``requests.Response`` of the first non-503 answer.
    """
    _req = requests.get(url, cookies=cookies, headers=headers, **kwargs)
    if _req.status_code != 503:
        return _req
    for p in proxies:
        print("Change ip to", p)  # was misspelled "Chage"
        _req = requests.get(url, cookies=cookies, headers=headers,
                            proxies={"http": "http://%s" % p}, **kwargs)
        if _req.status_code != 503:
            return _req
    print("Proxies Not Work, Please Wait!")
    sys.exit()
class FetchThread(threading.Thread):
    """Worker thread: turns queued listing ids into saved text files.

    Workflow:
        1. Pop an ``oid`` from the shared task pool and fetch the
           listing's mobile HTML page.
        2. Extract title / info / body / phone with the module-level
           regexes.
        3. Write the fields to ``<city>-<oid>.txt``.
    """

    def __init__(self, name, city, cookies):
        threading.Thread.__init__(self, name=name)
        self.city = city
        # Shared (per-city) pool of listing ids produced by BossThread.
        self.sets = city_sets[city]
        self.cookies = cookies

    def run(self):
        """Consume ids from the shared pool until it is empty."""
        while len(self.sets):
            try:
                oid = self.sets.pop()
            except KeyError:
                # Another worker drained the pool between our len()
                # check and the pop(); nothing left to do.
                break
            _url = "http://%s.baixing.com/m/ershouqiche/a%s.html" % (self.city, oid)
            _req = safe_get(_url, cookies=self.cookies)
            try:
                title = title_par.findall(_req.text)[0]
                infos = info_par.findall(_req.text)[1:]
                content = content_par.findall(_req.text)[0]
                tel = tel_par.findall(_req.text)[0]
            except IndexError:
                # Page did not match the expected layout; skip this id.
                # (Previously execution fell through and referenced
                # `title` etc. before assignment -> NameError.)
                print("IndexError", self.city, oid)
                continue
            print(len(self.sets), self.city, title, oid)
            with open("%s-%s.txt" % (self.city, oid), "wt", encoding='utf-8') as fi:
                fi.write("帖子原始URL:%s\n" % _url.replace("/m/", "/"))
                fi.write("帖子标题:%s\n" % title)
                fi.write("帖子正文:%s\n" % striphtml(content).replace("\r\n", ""))
                fi.write("帖子其他信息:%s\n" % " ".join(infos))
                fi.write("帖子联系方式(手机):%s\n" % tel)
class BossThread(threading.Thread):
    """Producer thread: fills one city's task pool with listing ids.

    Workflow:
        1. Walk the category's paginated listing pages.
        2. Harvest listing ids (``oid``) from each page's HTML and add
           them to the shared pool until it holds 100 ids.
    """

    def __init__(self, name, city, cookies):
        threading.Thread.__init__(self, name=name)
        self.city = city
        # Shared (per-city) pool consumed by FetchThread workers.
        self.sets = city_sets[city]
        self.cookies = cookies

    def run(self):
        """Collect listing ids until the pool holds 100 of them."""
        # Raw string with escaped dots: the original used "\d" inside a
        # plain string (deprecated escape) and unescaped "." characters
        # that matched any byte before "baixing"/"com"/"html".
        oid_par = re.compile(
            r'href="http://%s\.baixing\.com/m/ershouqiche/a(\d+)\.html"' % self.city)
        page_num = 0
        while len(self.sets) < 100:
            _url = get_page_url(self.city, page_num)
            page_num += 1
            req = safe_get(_url, self.cookies)
            for oid in oid_par.findall(req.text):
                if len(self.sets) >= 100:
                    # Pool is full; stop scanning this page (previously
                    # this message was printed once per surplus id).
                    print(self.city, len(self.sets), " Exiting")
                    break
                self.sets.add(oid)
                print("Create %s task, Now: %s" % (self.city, len(self.sets)))
# Listing ids repeat across pages, so each task pool is a set.
# Cities scraped: Shanghai, Zibo, Chongqing.
city_sets = {city: set() for city in ("shanghai", "zibo", "chongqing")}
def main():
    """Entry point: fill every city's task pool, then drain it.

    Phase 1 runs one BossThread per city and waits for all of them, so
    the pools are full before any worker starts.  Phase 2 runs 13
    FetchThread workers per city and waits for those as well.
    """
    def city_cookies(city):
        # The city homepage hands out the session cookies that the
        # mobile pages are fetched with.
        return requests.get('http://%s.baixing.com/' % city).cookies

    # Phase 1: produce tasks.
    bosses = []
    for city in city_sets:
        boss = BossThread(city, city, city_cookies(city))
        boss.start()
        bosses.append(boss)
    for boss in bosses:
        boss.join()

    # Phase 2: consume tasks.
    workers = []
    for city in city_sets:
        cookies = city_cookies(city)
        for idx in range(13):
            worker = FetchThread("%s-%s" % (city, idx), city, cookies)
            worker.start()
            workers.append(worker)
    for worker in workers:
        worker.join()
    print('Threads END')
if __name__ == '__main__':
    # perf_counter() is monotonic, so the printed elapsed time cannot
    # be skewed by system clock adjustments the way time.time() can.
    c_time = time.perf_counter()
    main()
    print(time.perf_counter() - c_time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Python3 Project