Created
December 5, 2012 02:08
-
-
Save yishenggudou/4211477 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from Queue import Queue | |
import threading | |
import urllib | |
import time | |
import sys | |
from gold import Gold | |
from db import DB | |
class Baidu(threading.Thread):
    """Worker thread that checks whether Baidu results for a record's keyword
    contain the ``gold/community-`` marker.

    The record object must expose ``getWord()`` and ``addItem(dict)``.
    When the thread finishes normally, the (annotated) record is put on
    ``queue`` so the caller can persist it.
    """

    # Marker substring searched for in the downloaded result page.
    _GOLD_MARKER = 'gold/community-'

    # Flag name -> extra keyword appended to the base word for follow-up
    # searches (see findExt).
    _EXT_KEYWORDS = {
        'baiduleju': '百度乐居', 'leju': '乐居', 'zxfy': '最新房源',
        'house': '房源', 'esf': '二手房', 'zf': '租房', 'info': '信息',
        'allprice': '总价', 'price': '价钱', 'houseprice': '房价',
        'roomtype': '户型', 'dd': '多大', 'area': '面积',
        'linkman': '联系人', 'agent': '中介', 'phone': '电话',
        'hotline': '咨询电话', 'freehotline': '免费咨询电话',
        'housedesc': '房源描述', 'latest': '最新',
    }

    def __init__(self, obj, queue, name):
        """Store the record, the result queue and the thread name.

        obj   -- record exposing getWord() / addItem(dict)
        queue -- queue receiving ``obj`` once run() completes
        name  -- thread name
        """
        threading.Thread.__init__(self)
        self.obj = obj
        self.queue = queue
        self.name = name
        self.wd = obj.getWord()

    def run(self):
        """Search for the base keyword, flag the record, then enqueue it."""
        html = self.getHtml(self.wd)
        found = self.findGold(html)
        # Default flags first; 'online' is overwritten below on a hit.
        self.obj.addItem({'online': 0, 'baiduhtml': ''})
        if found:
            self.obj.addItem({'online': 1})
            # NOTE(review): the original indentation was lost, so it is
            # assumed that the extension searches only run after a base hit.
            self.findExt()
        self.queue.put(self.obj)

    def getHtml(self, wd):
        """Download and return the Baidu page for ``wd``.

        When ``wd`` is not the record's own keyword it is treated as an
        extension and appended to the UTF-8 encoded base keyword.  On any
        failure the error and the URL are printed and SystemExit is raised,
        which terminates the calling thread.
        """
        if wd != self.wd:
            wd = self.wd.encode('utf8') + wd
        # str.strip() returns a new string; the result has to be kept.
        wd = wd.strip()
        url = None
        try:
            # NOTE(review): the keyword is interpolated without URL-encoding.
            url = self.url = 'http://www.baidu.com?wd={kwd}'.format(kwd=wd)
            ufile = urllib.urlopen(url)
            try:
                return ufile.read()
            finally:
                # Always release the connection, even if read() fails.
                ufile.close()
        except Exception as inst:
            print(inst)
            # Print the local: self.url may not exist if formatting failed.
            print(url)
            sys.exit(1)

    def findGold(self, html):
        """Return True when ``html`` contains the gold marker, else False.

        A plain ``str.find`` result must not be used as a truth value: it is
        -1 (truthy) on a miss and 0 (falsy) on a hit at the very start.
        """
        return self._GOLD_MARKER in html

    def findExt(self):
        """Search base keyword + each extension keyword and set the matching
        flag (``{flag: 1}``) on the record for every page holding the marker.
        """
        for flag, keyword in self._EXT_KEYWORDS.items():
            html = self.getHtml(keyword)
            if self.findGold(html):
                self.obj.addItem({flag: 1})
def work(start, limit):
    """Fetch rows from the DB, check each one in its own Baidu thread and
    store the annotated records back.

    start -- first row id, passed to DB.fetchData (coerced with int())
    limit -- value passed to the DB constructor (coerced with int())
    """
    queue = Queue()
    start = int(start)
    limit = int(limit)
    db = DB(limit)

    threads = []
    for row in db.fetchData(start):
        # Skip rows whose keyword column is empty (or missing).
        if not row[1]:
            continue
        record = Gold(row[0], row[1], row[2])
        threads.append(Baidu(record, queue, 'A'))

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Every producer has been joined, so the queue can be drained without
    # blocking.  A blocking get() would hang forever once the queue is empty
    # because no thread ever enqueues a None sentinel.
    while not queue.empty():
        db.add(queue.get_nowait())
    print('empty queue')
def main():
    """Command-line entry point: ``script START_ID [LIMIT]``.

    Prints a usage hint when the start id is missing.  LIMIT is taken from
    the command line when given and defaults to 10 otherwise (previously the
    value from the command line was always overwritten with 10, and a
    missing LIMIT raised IndexError).
    """
    nargs = len(sys.argv)
    if nargs < 2:
        print('Please type the start id and the limit')
        return
    start = sys.argv[1]
    limit = sys.argv[2] if nargs >= 3 else 10
    work(start, limit)
if __name__ == '__main__':
    # time.time() gives numeric timestamps that can be subtracted;
    # time.ctime() returns strings, and str - str raises TypeError.
    stime = time.time()
    main()
    etime = time.time()
    # Elapsed wall-clock time in seconds.
    print('all job had done. cost %s' % str(etime - stime))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment