Skip to content

Instantly share code, notes, and snippets.

@olsososo
Last active December 20, 2015 05:09
Show Gist options
  • Save olsososo/6076469 to your computer and use it in GitHub Desktop.
Save olsososo/6076469 to your computer and use it in GitHub Desktop.
一只简单的爬虫
# -*- coding:utf-8 -*-
import MySQLdb
import Queue
import re
import requests
import threading
import time
import sys
class WorkManager(object):
    """Seed the shared work queue from the chart page and run a worker pool.

    Construction fetches ``start_url``, enqueues one ``(href, label)`` tuple
    per chart row onto the module-level ``queue``, and then creates
    ``thread_num`` ``Work`` threads (which start themselves).
    """

    def __init__(self, start_url, thread_num=10):
        """Fill the queue from ``start_url`` and create ``thread_num`` workers."""
        self.threads = []
        self.__init_work_queue(start_url)
        self.__init_thread_pool(thread_num)

    def __init_work_queue(self, start_url):
        """Fetch the chart page and enqueue every ``(href, label)`` row.

        Exits the process with status 1 if the page cannot be fetched,
        because nothing useful can run without the seed list.
        """
        try:
            r = requests.get(start_url, timeout=30)
            # Treat 4xx/5xx as a failure instead of parsing an error page.
            r.raise_for_status()
        except requests.RequestException as e:
            print(e)
            sys.exit(1)
        # 'ignore' is the registered error-handler name; the previous
        # '//ignore' would raise LookupError if an encoding error ever occurred.
        text = r.text.encode('utf-8', 'ignore')
        p = re.compile(r'<td id="rk\d+?" class="chart">\s*?<a href="(.*?)">(.*?)</a>')
        for m in p.finditer(text):
            queue.put((m.group(1), m.group(2)))

    def __init_thread_pool(self, thread_num):
        """Create ``thread_num`` workers and remember them for joining."""
        for _ in range(thread_num):
            self.threads.append(Work())

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for item in self.threads:
            item.join()
class Work(threading.Thread):
    """Worker thread that drains the module-level ``queue``.

    The thread starts itself on construction. It stops at the first
    exception raised inside the loop, which includes the queue being empty
    on a non-blocking ``get``.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.start()

    def run(self):
        """Process ``(url, mobile)`` items until any exception ends the loop."""
        try:
            while True:
                # Pause before each fetch so requests are spaced out.
                time.sleep(0.5)
                print("thread:" + threading.current_thread().name)
                item = queue.get(block=False)
                url, mobile = item
                spider(url, mobile)
                queue.task_done()
        except Exception as err:
            # Any failure (including an empty queue) is printed and ends
            # this worker.
            print(err)
def spider(url, mobile, max_attempts=3):
    """Scrape one device page and append its specs to ``./data.txt``.

    Args:
        url: Page path, appended to the module-level ``domain_name``.
        mobile: Device label written as the first field of the record.
        max_attempts: How many times to try the HTTP request before
            giving up on this page.

    The record format is ``mobile###cpu###cores###ram`` followed by CRLF.
    Pages that cannot be fetched, or that lack any of the three fields, are
    reported on stdout and skipped.
    """
    fields = ("CPU Type", "Number of Cores", "Total RAM")
    p = re.compile(r'<span class="test_labels">(CPU Type|Number of Cores|Total RAM)</span></td>[\s|\S]*?<td class="altcompare">(.*?)</td>')

    # Retry in place instead of re-queuing: the old ``queue.put(url, mobile)``
    # passed ``mobile`` as the ``block`` flag and enqueued a bare string,
    # which broke the worker's ``url, mobile = queue.get()`` unpacking.
    text = None
    for _ in range(max_attempts):
        try:
            r = requests.get(domain_name + url, timeout=10)
        except requests.RequestException as e:
            print(e)
            continue
        text = r.text.encode('utf-8', 'ignore')
        break
    if text is None:
        return

    # Later matches overwrite earlier ones, as the original assignments did.
    specs = {}
    for m in p.finditer(text):
        specs[m.group(1)] = m.group(2)

    missing = [name for name in fields if name not in specs]
    if missing:
        # A retry cannot fix a page that simply lacks the field.
        print("skipping %s: missing %s" % (url, ", ".join(missing)))
        return

    with open("./data.txt", 'a') as f:
        f.write(mobile + "###" + "###".join(specs[name] for name in fields) + "\r\n")
def data():
    """Insert the records in ``./data.txt`` into the ``benchmarks`` table.

    Each non-empty line is expected to be
    ``<brand> <product>###<cpu>###<cores>###<ram>``. The first field is split
    on its first space into brand and product; a one-word name is stored as
    the brand with an empty product. Lines that do not have exactly four
    ``###``-separated fields are reported and skipped.

    Uses the module-level cursor ``cur``. Insert errors are printed and do
    not stop the import. This function does not commit.
    """
    with open("./data.txt", 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # Skip blank lines rather than ending the import early.
                continue
            try:
                products, cpu, num, ram = line.split("###")
            except ValueError:
                print("skipping malformed line: " + line)
                continue
            brand, _, product = products.partition(" ")
            try:
                # Let the driver quote the values: they come from scraped
                # HTML and may contain quotes or other SQL metacharacters.
                cur.execute(
                    "insert into benchmarks (brand,product,cpu,num,ram) "
                    "values (%s,%s,%s,%s,%s)",
                    (brand, product, cpu, num, ram))
            except Exception as e:
                print(e)
if __name__ == "__main__":
    # These names are read as module-level globals by WorkManager, Work,
    # spider and data, so they must stay at module scope.
    queue = Queue.Queue()
    start_url = "http://www.androidbenchmark.net/cpumark_chart.html"
    domain_name = "http://www.androidbenchmark.net/"
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='test', port=3306)
    cur = conn.cursor()
    try:
        # Crawl first (workers append to ./data.txt), then load the file.
        wm = WorkManager(start_url)
        wm.wait_all_complete()
        data()
        # MySQLdb does not autocommit by default; without this the inserts
        # are rolled back on close for transactional tables.
        conn.commit()
    finally:
        # Release the cursor and connection even if the crawl or load fails.
        cur.close()
        conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment