@zhwei
Created June 7, 2014 02:35
Fetch the detailed content of one hundred posts in the used-car (ershouqiche) category of specified cities on baixing.com
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import time
import threading
try:
    import requests
except ImportError:
    print("Please install the `requests` package: `pip install requests`")
    sys.exit()
headers = {}
proxies = (
    "222.66.115.233:80",
)
# Patterns for the post title, the <li> info items, the post body, and the phone number
title_par = re.compile('<div class="viewad"><h3>(.*?)<')
info_par = re.compile('<li>(.*?)</li>')
content_par = re.compile('<div class="description">(.*?)</div>', re.M|re.S)
tel_par = re.compile('<a id="contact-number" href="tel:(.*?)"><', re.M|re.S)
def striphtml(data):
    """ Remove html tags """
    p = re.compile(r'<.*?>')
    return p.sub('', data)
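# Illustrative call (not from the original script):
#   striphtml('<b>Audi</b> A4, <i>2010</i>')  ->  'Audi A4, 2010'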
def get_page_url(city, num=0):
    """Build the listing-page URL for page `num` of the given city"""
    _url = "http://%s.baixing.com/m/ershouqiche/" % city
    if num:
        _url += "?page=%s" % num
    return _url
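# Illustrative calls (not from the original script):
#   get_page_url("shanghai")     ->  "http://shanghai.baixing.com/m/ershouqiche/"
#   get_page_url("shanghai", 2)  ->  "http://shanghai.baixing.com/m/ershouqiche/?page=2"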
def safe_get(url, cookies, headers=headers, **kwargs):
    """Wrap requests.get, retrying through proxies when the site answers 503 (anti-scraping block)"""
    _req = requests.get(url, cookies=cookies, headers=headers, **kwargs)
    if _req.status_code == 503:
        for p in proxies:
            print("Change ip to", p)
            _req = requests.get(url, cookies=cookies, headers=headers,
                                proxies={"http": "http://%s" % p}, **kwargs)
            if _req.status_code != 503:
                return _req
        print("Proxies did not work, please wait!")
        sys.exit()
    else:
        return _req
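# Illustrative usage (hypothetical URL values):
#   cookies = requests.get("http://shanghai.baixing.com/").cookies
#   page = safe_get("http://shanghai.baixing.com/m/ershouqiche/", cookies=cookies)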
class FetchThread(threading.Thread):
    """ Worker thread: consumes tasks
    Workflow:
    1. Pop an `oid` from the task pool and fetch that post's `html` page
    2. Parse the `html` with the regular expressions above to extract the target fields
    3. Write the extracted fields to a txt file in a fixed format
    """
    def __init__(self, name, city, cookies):
        threading.Thread.__init__(self, name=name)
        self.city = city
        self.sets = city_sets[city]
        self.cookies = cookies
    def run(self):
        """Consume oids until the task pool is empty"""
        while len(self.sets):
            try:
                oid = self.sets.pop()
            except KeyError:
                # Another worker drained the pool between the check and the pop
                break
            _url = "http://%s.baixing.com/m/ershouqiche/a%s.html" % (self.city, oid)
            _req = safe_get(_url, cookies=self.cookies)
            try:
                title = title_par.findall(_req.text)[0]
                infos = info_par.findall(_req.text)[1:]
                content = content_par.findall(_req.text)[0]
                tel = tel_par.findall(_req.text)[0]
            except IndexError:
                # A pattern did not match; skip this post instead of writing stale data
                print("IndexError", self.city, oid)
                continue
            print(len(self.sets), self.city, title, oid)
            # Output labels (Chinese): original URL / title / body / other info / contact (mobile)
            with open("%s-%s.txt" % (self.city, oid), "wt", encoding='utf-8') as fi:
                fi.write("帖子原始URL:%s\n" % _url.replace("/m/", "/"))
                fi.write("帖子标题:%s\n" % title)
                fi.write("帖子正文:%s\n" % striphtml(content).replace("\r\n", ""))
                fi.write("帖子其他信息:%s\n" % " ".join(infos))
                fi.write("帖子联系方式(手机):%s\n" % tel)
class BossThread(threading.Thread):
    """ Boss thread: produces tasks
    Workflow:
    1. Parse the category listing pages' `html` to collect a fixed number of `oid`s
    2. Put the collected `oid`s into the task pool
    """
    def __init__(self, name, city, cookies):
        threading.Thread.__init__(self, name=name)
        self.city = city
        self.sets = city_sets[city]
        self.cookies = cookies
    def run(self):
        """Walk the listing pages until 100 oids have been collected"""
        oid_par = re.compile(r'href="http://%s.baixing.com/m/ershouqiche/a(\d+)\.html"' % self.city)
        page_num = 0
        while len(self.sets) < 100:
            _url = get_page_url(self.city, page_num)
            page_num += 1
            req = safe_get(_url, self.cookies)
            oids = oid_par.findall(req.text)
            for oid in oids:
                if len(self.sets) < 100:
                    self.sets.add(oid)
                    print("Create %s task, Now: %s" % (self.city, len(self.sets)))
                else:
                    print(self.city, len(self.sets), " Exiting")
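# Illustrative match for `oid_par` (hypothetical oid): the anchor
#   href="http://shanghai.baixing.com/m/ershouqiche/a123456.html"
# yields the capture "123456", which becomes one task in the pool.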
# Listing pages repeat IDs, so a set is used as the task pool to deduplicate
# Three cities are scraped: Shanghai, Zibo, Chongqing
city_sets = {
    "shanghai": set(),
    "zibo": set(),
    "chongqing": set(),
}
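# For reference, set.add() drops duplicates silently:
#   s = set(); s.add("123"); s.add("123")  ->  len(s) == 1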
def main():
    """Entry point: create boss and worker threads"""
    # Produce tasks: one boss thread per city
    _threads = []
    for city in city_sets:
        cookies = requests.get('http://%s.baixing.com/' % city).cookies
        boss = BossThread(city, city, cookies)
        boss.start()
        _threads.append(boss)
    for i in _threads:
        i.join()
    # Consume tasks: 13 worker threads per city
    _threads = []
    for city in city_sets:
        cookies = requests.get('http://%s.baixing.com/' % city).cookies
        for i in range(13):
            worker = FetchThread("%s-%s" % (city, i), city, cookies)
            worker.start()
            _threads.append(worker)
    for i in _threads:
        i.join()
    print('Threads END')
if __name__ == '__main__':
    c_time = time.time()
    main()
    print(time.time() - c_time)