Skip to content

Instantly share code, notes, and snippets.

@yishenggudou
Created December 5, 2012 02:08
Show Gist options
  • Save yishenggudou/4211477 to your computer and use it in GitHub Desktop.
Save yishenggudou/4211477 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Queue import Queue
import threading
import urllib
import time
import sys
from gold import Gold
from db import DB
class Baidu(threading.Thread):
    """Worker thread that searches Baidu for one keyword and flags marker hits.

    The thread fetches the Baidu page for the keyword supplied by ``obj``,
    records on ``obj`` whether the page contains ``GOLD_MARKER``, optionally
    repeats the check for the keyword combined with each entry of
    ``EXT_KEYWORDS``, and finally puts ``obj`` on ``queue``.

    Parameters
    ----------
    obj:
        Record object; must provide ``getWord()`` (the base keyword) and
        ``addItem(dict)`` (used to store result flags).
    queue:
        Queue that receives ``obj`` once the checks are finished.
    name:
        Thread name.
    """

    # Substring whose presence in the result page counts as a hit.
    GOLD_MARKER = 'gold/community-'

    # Result-flag name -> extra search term appended to the base keyword.
    EXT_KEYWORDS = {
        'baiduleju': '百度乐居', 'leju': '乐居', 'zxfy': '最新房源',
        'house': '房源', 'esf': '二手房', 'zf': '租房', 'info': '信息',
        'allprice': '总价', 'price': '价钱', 'houseprice': '房价',
        'roomtype': '户型', 'dd': '多大', 'area': '面积',
        'linkman': '联系人', 'agent': '中介', 'phone': '电话',
        'hotline': '咨询电话', 'freehotline': '免费咨询电话',
        'housedesc': '房源描述', 'latest': '最新',
    }

    def __init__(self, obj, queue, name):
        threading.Thread.__init__(self)
        self.obj = obj
        self.queue = queue
        self.name = name
        self.wd = obj.getWord()

    def run(self):
        """Check the base keyword, then the extended keywords, then enqueue ``obj``."""
        html = self.getHtml(self.wd)
        result = self.findGold(html)
        # The page HTML itself is not stored; 'baiduhtml' is set to ''.
        self.obj.addItem({'online': 0, 'baiduhtml': ''})
        if result:
            self.obj.addItem({'online': 1})
            self.findExt()
        self.queue.put(self.obj)

    @staticmethod
    def _toUtf8(text):
        """Return *text* as a UTF-8 byte string; byte strings pass through."""
        if isinstance(text, bytes):
            return text
        return text.encode('utf8')

    def _keyword(self, wd):
        """Build the search keyword (UTF-8 bytes) for *wd*.

        Returns the stripped base keyword when *wd* is the base keyword
        itself, otherwise the base keyword followed directly by *wd*.
        Both parts are normalised to UTF-8 bytes first so that unicode and
        byte-string inputs compare and concatenate consistently.
        """
        base = self._toUtf8(self.wd).strip()
        extra = self._toUtf8(wd).strip()
        if extra == base:
            return base
        return base + extra

    def getHtml(self, wd):
        """Fetch the Baidu page for *wd* and return the response body.

        The requested URL is stored in ``self.url``.  On any fetch error the
        error and URL are printed and ``SystemExit(1)`` is raised; inside a
        worker thread this ends only that thread, so ``run`` will not
        enqueue its object.
        """
        keyword = self._keyword(wd)
        # Percent-encode the keyword so spaces and non-ASCII bytes form a
        # valid URL.  self.url is assigned before the try block so that the
        # error handler can always print it.
        # NOTE(review): Baidu search is normally served from /s?wd=... --
        # confirm that this URL actually returns search results.
        self.url = 'http://www.baidu.com?wd={kwd}'.format(
            kwd=urllib.quote(keyword))
        try:
            ufile = urllib.urlopen(self.url)
            try:
                return ufile.read()
            finally:
                ufile.close()
        except Exception as inst:
            print(inst)
            print(self.url)
            raise SystemExit(1)

    def findGold(self, html):
        """Return True when *html* contains ``GOLD_MARKER``, else False.

        A plain ``str.find`` result must not be used as a truth value: it is
        -1 (truthy) on a miss and 0 (falsy) for a match at the very start.
        """
        return self.GOLD_MARKER in html

    def findExt(self):
        """Search the base keyword plus each extra term; flag every hit on ``obj``."""
        for flag, term in self.EXT_KEYWORDS.items():
            html = self.getHtml(term)
            if self.findGold(html):
                self.obj.addItem({flag: 1})
def work(start, limit):
    """Run one Baidu worker thread per fetched row and store the results.

    Parameters
    ----------
    start:
        Passed to ``db.fetchData`` after conversion with ``int``.
    limit:
        Passed to the ``DB`` constructor after conversion with ``int``.

    Rows whose second field is empty are skipped.  Every remaining row is
    wrapped in a ``Gold`` object and handed to its own ``Baidu`` thread; once
    all threads have finished, every object they queued is passed to
    ``db.add``.
    """
    queue = Queue()
    start = int(start)
    limit = int(limit)
    db = DB(limit)
    rows = db.fetchData(start)

    threads = []
    for r in rows:
        if len(r[1]) > 0:
            threads.append(Baidu(Gold(r[0], r[1], r[2]), queue, 'A'))

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Every producer has been joined, so empty() is reliable here.  The
    # previous loop blocked in queue.get() waiting for a None sentinel that
    # nothing ever enqueued, which hung the program once the queue drained.
    while not queue.empty():
        db.add(queue.get())
    print('empty queue')
def main():
    """Read the start id and optional limit from ``sys.argv`` and call ``work``.

    Usage: ``script START_ID [LIMIT]``.  With no arguments a usage message is
    printed and nothing else happens.  ``LIMIT`` defaults to 10 when omitted.
    """
    args = sys.argv[1:]
    if not args:
        print('Please type the start id and the limit')
        return
    start = args[0]
    # Previously argv[2] was read unconditionally (IndexError when only the
    # start id was given) and then overwritten with 10; treat 10 as the
    # default instead of discarding the user's value.
    limit = args[1] if len(args) > 1 else 10
    work(start, limit)
if __name__ == '__main__':
    # Use numeric timestamps: time.ctime() returns formatted strings, and
    # subtracting two strings raises TypeError.  The reported cost is the
    # elapsed wall-clock time in seconds.
    stime = time.time()
    main()
    etime = time.time()
    print('all job had done. cost %s' % str(etime - stime))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment