Last active
December 20, 2015 05:09
-
-
Save olsososo/6076469 to your computer and use it in GitHub Desktop.
一只简单的爬虫
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import MySQLdb | |
import Queue | |
import re | |
import requests | |
import threading | |
import time | |
import sys | |
class WorkManager(object):
    """Seed the shared work queue from the chart page and run a worker pool.

    Construction fetches ``start_url``, puts one ``(href, link_text)`` tuple
    per chart row on the module-level ``queue``, and then creates
    ``thread_num`` ``Work`` threads.  Call ``wait_all_complete()`` to block
    until every worker has finished.

    Args:
        start_url: URL of the chart page listing the device links.
        thread_num: Number of ``Work`` threads to create (default 10).
    """

    def __init__(self, start_url, thread_num=10):
        self.threads = []
        # The queue must be filled before any worker is created: workers
        # terminate as soon as they find the queue empty.
        self.__init_work_queue(start_url)
        self.__init_thread_pool(thread_num)

    def __init_work_queue(self, start_url):
        """Scrape ``start_url`` and enqueue every (href, device name) pair.

        Any failure (network error, non-2xx status, ...) is printed and the
        process exits with a non-zero status, since nothing can be crawled
        without the seed list.
        """
        try:
            r = requests.get(start_url, timeout=30)
            # Do not try to parse an HTTP error page as if it were the chart.
            r.raise_for_status()
            # 'ignore' is the real codec error-handler name; the previous
            # '//ignore' is not registered and would raise LookupError the
            # moment the handler was actually needed.
            text = r.text.encode('utf-8', 'ignore')
            p = re.compile(r'<td id="rk\d+?" class="chart">\s*?<a href="(.*?)">(.*?)</a>')
            for m in p.finditer(text):
                queue.put((m.group(1), m.group(2)))
        except Exception as e:
            print(e)
            # Signal failure to the calling shell instead of exiting with 0.
            sys.exit(1)

    def __init_thread_pool(self, thread_num):
        """Create ``thread_num`` Work instances and remember them for join()."""
        for _ in range(thread_num):
            self.threads.append(Work())

    def wait_all_complete(self):
        """Block until every worker thread has terminated."""
        for item in self.threads:
            item.join()
class Work(threading.Thread):
    """Worker thread that drains the module-level ``queue``.

    Each item is expected to be a ``(url, mobile)`` tuple, which is handed
    to ``spider``.  The thread starts itself on construction and stops once
    a non-blocking ``get`` finds the queue empty.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        # Start immediately on construction: the caller only keeps the
        # instance around in order to join() it later.
        self.start()

    def run(self):
        """Process queue items until the queue is empty."""
        while True:
            # Small pause between iterations to throttle requests.
            time.sleep(0.5)
            print("thread:" + threading.current_thread().name)
            try:
                item = queue.get(block=False)
            except Queue.Empty:
                # Normal termination: no work left for this worker.
                break
            try:
                url, mobile = item
                spider(url, mobile)
            except Exception as e:
                # One bad item must not take the whole worker down; report
                # it and move on to the next one.
                print(e)
            finally:
                # Balance every successful get(), even when processing
                # failed, so the queue's unfinished-task count stays right.
                queue.task_done()
# Upper bound on failed fetches per URL, so a permanently failing page is
# not re-queued forever.
_MAX_FETCH_ATTEMPTS = 3
# url -> number of failed fetch attempts so far (plain dict, unsynchronized).
_fetch_failures = {}


def spider(url, mobile):
    """Fetch one device page and append its specs to ``./data.txt``.

    The page at ``domain_name + url`` is scanned for the "CPU Type",
    "Number of Cores" and "Total RAM" rows; when all three are present a
    line ``mobile###cpu###num###ram`` is appended to ``./data.txt``.

    A failed request (network error or non-2xx status) is printed and the
    ``(url, mobile)`` tuple is put back on the module-level ``queue`` until
    ``_MAX_FETCH_ATTEMPTS`` failures have been recorded for that URL.
    Pages that load but lack a field are reported and skipped, because
    retrying them cannot succeed.  This function never raises.

    Args:
        url: Page path relative to ``domain_name``.
        mobile: Device name written as the first field of the output line.
    """
    try:
        r = requests.get(domain_name + url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(e)
        failures = _fetch_failures.get(url, 0) + 1
        _fetch_failures[url] = failures
        if failures < _MAX_FETCH_ATTEMPTS:
            # Queue.put() takes ONE item.  Passing url and mobile as two
            # arguments made ``mobile`` the ``block`` flag and enqueued a
            # bare string that consumers could not unpack.
            queue.put((url, mobile))
        return

    try:
        # 'ignore' is the valid codec error-handler name ('//ignore' is not).
        text = r.text.encode('utf-8', 'ignore')
        p = re.compile(r'<span class="test_labels">(CPU Type|Number of Cores|Total RAM)</span></td>[\s|\S]*?<td class="altcompare">(.*?)</td>')
        # Later matches overwrite earlier ones for the same label.
        specs = {}
        for m in p.finditer(text):
            specs[m.group(1)] = m.group(2)

        cpu = specs.get("CPU Type")
        num = specs.get("Number of Cores")
        ram = specs.get("Total RAM")
        if cpu is None or num is None or ram is None:
            print("skipping %s: missing spec fields" % url)
            return

        # ``with`` guarantees the handle is closed even if write() fails.
        with open("./data.txt", 'a') as f:
            f.write(mobile + "###" + cpu + "###" + num + "###" + ram + "\r\n")
    except Exception as e:
        # Deterministic parse/write failure: report it, do not re-queue,
        # and do not let it propagate into the worker loop.
        print(e)
def data(path="./data.txt", cursor=None):
    """Load scraped records from a ``###``-separated file into ``benchmarks``.

    Each non-blank line must have the form ``<name>###<cpu>###<num>###<ram>``.
    ``<name>`` is split at its first space into brand and product; a
    one-word name yields an empty product.  Blank lines are skipped and
    lines without exactly four fields are reported and skipped.  A failed
    insert is printed and the import continues with the next line.

    This function only executes INSERT statements; it does not commit.

    Args:
        path: File to read (defaults to the crawler's ``./data.txt``).
        cursor: DB-API cursor to execute on; defaults to the module-level
            ``cur``.

    Raises:
        IOError: If ``path`` cannot be opened.
    """
    if cursor is None:
        cursor = cur

    # Placeholders are left unquoted and the values are passed separately so
    # the driver escapes them; interpolating scraped text straight into the
    # statement breaks on any value that contains a quote.
    sql = ("insert into benchmarks (brand,product,cpu,num,ram) "
           "values (%s,%s,%s,%s,%s)")

    with open(path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # A blank line is not end-of-file; keep reading.
                continue
            fields = line.split("###")
            if len(fields) != 4:
                print("skipping malformed line: %r" % line)
                continue
            products, cpu, num, ram = fields
            # partition() never raises: a name without a space becomes
            # brand=<name>, product=''.
            brand, _, product = products.partition(" ")
            try:
                cursor.execute(sql, (brand, product, cpu, num, ram))
            except Exception as e:
                print(e)
if __name__ == "__main__":
    # queue, domain_name and cur are read as module-level globals by
    # WorkManager, Work, spider and data, so they must be bound here before
    # any of those run.
    queue = Queue.Queue()
    start_url = "http://www.androidbenchmark.net/cpumark_chart.html"
    domain_name = "http://www.androidbenchmark.net/"

    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='test', port=3306)
    try:
        cur = conn.cursor()
        try:
            # Phase 1: crawl every device page into ./data.txt.
            wm = WorkManager(start_url)
            wm.wait_all_complete()
            # Phase 2: load ./data.txt into the benchmarks table.
            data()
            # DB-API connections do not autocommit by default; without this
            # the inserts are discarded when the connection closes.
            conn.commit()
        finally:
            cur.close()
    finally:
        # Runs on errors and on sys.exit() too, so the connection is
        # always released.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment