Skip to content

Instantly share code, notes, and snippets.

@yishenggudou
Created December 5, 2012 02:44
Show Gist options
  • Select an option

  • Save yishenggudou/4211667 to your computer and use it in GitHub Desktop.

Select an option

Save yishenggudou/4211667 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Queue import Queue
import threading
import urllib
import time
import sys
from gold import Gold
from db import DB
class Baidu(threading.Thread):
    """Worker thread that searches Baidu for one keyword and flags result hits.

    The thread fetches the result page for the base keyword, records whether
    the page contains ``GOLD_MARKER``, repeats the search for the base keyword
    combined with every suffix in ``EXT_KEYWORDS``, and finally puts ``obj``
    on ``queue``.

    ``obj`` must provide ``getWord()`` (the base keyword) and
    ``addItem(dict)`` (used to record the flags).
    """

    # Substring whose presence in a result page counts as a hit.
    GOLD_MARKER = 'gold/community-'

    # Flag name -> keyword suffix appended to the base keyword for the
    # additional searches performed by findExt().
    EXT_KEYWORDS = {
        'baiduleju': u'百度乐居', 'leju': u'乐居', 'zxfy': u'最新房源',
        'house': u'房源', 'esf': u'二手房', 'zf': u'租房', 'info': u'信息',
        'allprice': u'总价', 'price': u'价钱', 'houseprice': u'房价',
        'roomtype': u'户型', 'dd': u'多大', 'area': u'面积',
        'linkman': u'联系人', 'agent': u'中介', 'phone': u'电话',
        'hotline': u'咨询电话', 'freehotline': u'免费咨询电话',
        'housedesc': u'房源描述', 'latest': u'最新',
    }

    def __init__(self, obj, queue, name):
        """Store the collaborators and read the base keyword from ``obj``.

        obj   -- record object exposing getWord() and addItem(dict)
        queue -- queue that receives ``obj`` once run() has finished
        name  -- thread name
        """
        threading.Thread.__init__(self)
        self.obj = obj
        self.queue = queue
        self.name = name
        self.wd = obj.getWord()

    def run(self):
        """Search the base keyword, then the extended ones, and queue ``obj``."""
        html = self.getHtml(self.wd)
        # An earlier revision stored the page HTML under 'baiduhtml'; an empty
        # placeholder is recorded instead.
        self.obj.addItem({'online': 0, 'baiduhtml': ''})
        if self._hasGold(html):
            self.obj.addItem({'online': 1})
        # NOTE(review): the source lost its indentation, so it is unclear
        # whether findExt() was meant to run only on a base-keyword hit. The
        # old truthiness test on str.find() was true for every miss (-1), so
        # in practice it always ran; that behaviour is kept here.
        self.findExt()
        self.queue.put(self.obj)

    @staticmethod
    def _toUnicode(text):
        """Return ``text`` as unicode, decoding byte strings as UTF-8.

        NOTE(review): assumes byte-string keywords are UTF-8 encoded (the
        file itself declares utf-8) -- confirm against the data source.
        """
        if isinstance(text, bytes):
            return text.decode('utf-8')
        return text

    def getHtml(self, wd):
        """Fetch and return the Baidu result page for ``wd``.

        When ``wd`` differs from the base keyword it is treated as a suffix
        and the query becomes base keyword + ``wd``. Both parts are stripped
        of surrounding whitespace and the query is percent-encoded as UTF-8,
        because urllib.urlopen rejects URLs containing non-ASCII characters.

        On any failure the exception and the URL are printed and SystemExit
        is raised. Inside a worker thread this ends only that thread, so no
        result is queued for this keyword.
        """
        # Defined up front so the error handler can always print it.
        self.url = None
        try:
            base = self._toUnicode(self.wd).strip()
            query = self._toUnicode(wd).strip()
            if query != base:
                query = base + query
            quoted = urllib.quote(query.encode('utf-8'))
            self.url = u'http://www.baidu.com?wd={kwd}'.format(kwd=quoted)
            ufile = urllib.urlopen(self.url)
            try:
                return ufile.read()
            finally:
                # Release the connection even if read() fails.
                ufile.close()
        except Exception as inst:
            print(inst)
            print(self.url)
            sys.exit(1)

    def findGold(self, html):
        """Return the index of ``GOLD_MARKER`` in ``html``, or -1 if absent.

        The result is an index, not a boolean: -1 is truthy and 0 is falsy,
        so use _hasGold() for a yes/no answer.
        """
        return html.find(self.GOLD_MARKER)

    def _hasGold(self, html):
        """Return True when ``html`` contains ``GOLD_MARKER``."""
        return self.findGold(html) != -1

    def findExt(self):
        """Search every extended keyword and set its flag to 1 on a hit."""
        for flag, suffix in self.EXT_KEYWORDS.items():
            if self._hasGold(self.getHtml(suffix)):
                self.obj.addItem({flag: 1})
def work(start, limit):
    """Run one Baidu worker per usable row and store every result.

    start -- value passed to db.fetchData() after conversion to int
    limit -- value passed to the DB constructor after conversion to int

    A row is skipped when its second column is empty. Every worker is
    started, then joined, and afterwards each object the workers queued is
    handed to db.add().
    """
    queue = Queue()
    start = int(start)
    limit = int(limit)
    db = DB(limit)

    threads = []
    for row in db.fetchData(start):
        # No keyword to search for (empty or missing second column).
        if not row[1]:
            continue
        record = Gold(row[0], row[1], row[2])
        threads.append(Baidu(record, queue, 'A'))

    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()

    # Every producer has finished, so empty() is reliable here. The previous
    # loop called the blocking queue.get() and waited for a None item that no
    # worker ever enqueues, so it hung once the queue had been drained.
    while not queue.empty():
        db.add(queue.get())
    print('empty queue')
def _parseArgs(argv, defaultLimit=10):
    """Return (start, limit) taken from ``argv``, or None if start is missing.

    ``argv[1]`` is the start id and ``argv[2]`` the limit; when the limit is
    omitted, ``defaultLimit`` is used.
    """
    if len(argv) < 2:
        return None
    limit = argv[2] if len(argv) > 2 else defaultLimit
    return argv[1], limit


def main():
    """Read the start id and limit from the command line and run work().

    Prints a usage hint and returns when no start id is given. Previously a
    single argument passed the length check but crashed on sys.argv[2], and a
    supplied limit was always overwritten with 10; 10 is now only the default
    used when the limit is omitted.
    """
    args = _parseArgs(sys.argv)
    if args is None:
        print('Please type the start id and the limit')
        return
    start, limit = args
    work(start, limit)
if __name__ == '__main__':
    # time.ctime() returns formatted strings, which cannot be subtracted;
    # time.time() returns seconds as a float, so the difference is the
    # elapsed wall-clock time in seconds.
    stime = time.time()
    main()
    etime = time.time()
    print('all job had done. cost %s' % str(etime - stime))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment