Skip to content

Instantly share code, notes, and snippets.

@muxuezi
Created May 26, 2014 09:08
Show Gist options
  • Save muxuezi/8d7cd57710b156463d49 to your computer and use it in GitHub Desktop.
doulist with thread and queue
# -*- coding: utf-8 -*-
import urllib2
import thread
import Queue
import time
from bs4 import BeautifulSoup
def findlen(item):
    """Return the number of listing pages for the doulist category *item*.

    Fetches the first listing page of the category and reads the page
    count from its ``div.paginator`` element: the second-to-last
    whitespace-separated token of the paginator text is parsed as an int
    (presumably the last page number, followed by the "next" link).

    When the page has no paginator element the listing is treated as a
    single page and 1 is returned, matching how ``producer`` handles a
    missing paginator.

    Raises whatever ``urllib2.urlopen`` raises on network failure, and
    ``ValueError``/``IndexError`` if the paginator text has an unexpected
    shape.
    """
    url = 'http://dongxi.douban.com/doulists/%s/?start=0' % (item)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    paginator = soup.find('div', class_='paginator')
    if paginator is None:
        # No pagination widget: only one page of results exists.
        return 1
    return int(paginator.text.split()[-2])
def getname_link(item, d):
    """Collect the doulist titles and links found on one listing page.

    *item* is the category slug and *d* the ``start`` offset of the page
    to fetch. Every ``<a class="doulist-title">`` on that page is
    appended, as a ``(stripped text, href)`` tuple, to the module-level
    list ``doulist_name_link``, which must already exist when this
    function is called. Nothing is returned.
    """
    # Both placeholders need a value; supplying only ``item`` raised
    # "TypeError: not enough arguments for format string" on every call.
    url = 'http://dongxi.douban.com/doulists/%s/?start=%d' % (item, d)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    for unit in soup.findAll('a', class_='doulist-title'):
        doulist_name_link.append((unit.text.strip(), unit.get('href')))
def test(idx, name, url, allpage, dataQueue):
    """Scrape every page of one doulist and queue one record per item.

    For each offset in *allpage* the page ``<url without query>?start=<offset>``
    is fetched and each ``<li class="carditem card-story-large ">`` element
    is turned into a list::

        [str(idx), item title, item href, price text or 'None'] + stats tokens

    which is put on *dataQueue*. The stats tokens are the whitespace-split
    text of the item's ``ul.stats-list`` element.

    *name* is accepted for interface compatibility but is not part of the
    emitted record (the record carries the item's own title instead).

    Raises ``AttributeError`` if an item lacks its first ``<a>``, a
    ``title`` attribute, or the stats list; network errors from
    ``urllib2.urlopen`` propagate unchanged.
    """
    base_url = url.split('?')[0]
    for page in allpage:
        aurl = '%s?start=%d' % (base_url, page)
        html_doc = urllib2.urlopen(aurl).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        for itemchild in soup.findAll('li', class_='carditem card-story-large '):
            anchor = itemchild.find('a')
            item_name = anchor.get('title').strip()
            link = anchor.get('href')
            # An explicit None check replaces try/except/finally: the old
            # ``finally`` body ran even when an unexpected error left
            # ``price`` unbound, hiding the real failure behind a NameError.
            price_tag = itemchild.find('span', class_="commodity-price")
            price = 'None' if price_tag is None else price_tag.text
            cons = itemchild.find('ul', class_="stats-list").text.split()
            dataQueue.put([str(idx), item_name, link, price] + cons)
def producer(doulist_name_link, idnum, dataQueue):
    """Scrape this producer's share of doulists and queue their items.

    Producer *idnum* handles the entries of *doulist_name_link* at indexes
    ``idnum * nummessages`` up to (not including)
    ``(idnum + 1) * nummessages``, where ``nummessages`` is a module-level
    global that must be set before the producer starts.

    Each entry may be either a ``(name, url)`` pair (what ``getname_link``
    builds) or a single ``"name<TAB>url"`` string (the line format written
    to ``doulist.txt``). For every entry the doulist page is fetched, its
    page offsets are derived from the ``div.paginator`` element (a single
    offset ``0`` when there is no paginator), and ``test`` is called to
    push the items onto *dataQueue*.
    """
    first = idnum * nummessages
    for msgnum in range(first, first + nummessages):
        entry = doulist_name_link[msgnum]
        if isinstance(entry, (tuple, list)):
            # Pairs cannot be ``.split``; unpack them directly.
            name, url = entry
        else:
            name, url = entry.split('\t')
        url = url.strip()
        html_doc = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        paginator = soup.find('div', class_='paginator')
        if paginator is None:
            allpage = [0]
        else:
            # Second-to-last token of the paginator text is the page count.
            lenpage = int(paginator.text.split()[-2])
            allpage = [20 * x for x in range(lenpage)]
        test(msgnum, name, url, allpage, dataQueue)
def consumer(idnum, dataQueue):
    """Drain *dataQueue* forever, logging each record to file and stdout.

    Every 0.1 s the queue is polled without blocking. When a record is
    available it is tab-joined, UTF-8 encoded and appended to
    ``doutemp2.txt`` as ``consumer <idnum> got => <record>+``; a short
    notice carrying the record's first field is then printed. Both
    outputs happen while holding the module-level ``safeprint`` lock.
    The loop never terminates on its own.
    """
    while True:
        time.sleep(0.1)
        try:
            record = dataQueue.get_nowait()
        except Queue.Empty:
            # Nothing queued yet; go back to sleep.
            continue
        with safeprint:
            with open('doutemp2.txt', 'a+') as fileout:
                payload = '\t'.join(record).encode('utf-8')
                fileout.write('consumer %s got => %s+\n' % (str(idnum), payload))
            print('consumer %s got => %s' % (idnum, record[0]))
def alldoulist():
    """Build, save and return the list of all doulists in every category.

    For each category the current page count is read with ``findlen`` and
    printed, then ``getname_link`` is called for every page offset
    (multiples of 20). The collected ``(name, link)`` pairs are written to
    ``doulist.txt`` as UTF-8, one ``name<TAB>link`` line per doulist, and
    the list is returned.

    ``getname_link`` appends to the module-level name
    ``doulist_name_link``, so the list is created as a global here rather
    than as a local that the helper could never see.
    """
    global doulist_name_link
    doulist_name_link = []
    # Presumably page counts recorded on 2014-05-25; each value is
    # replaced by the live count from findlen() below.
    list_all = {'life': 83, 'interesting': 25,
                'fashion': 35, 'tech': 10}
    for k in list(list_all):
        v = findlen(k)
        list_all[k] = v  # update dict
        print('%s %s' % (k, v))
        for d in range(0, 20 * v, 20):
            getname_link(k, d)  # collect list names for this page
    with open('doulist.txt', 'w') as filelist:
        for x in doulist_name_link:
            # Names are unicode (often non-ASCII); encode explicitly, since
            # writing them raw to a Python 2 file raises UnicodeEncodeError.
            line = u'\t'.join(x) + u'\n'
            filelist.write(line.encode('utf-8'))
    return doulist_name_link
if __name__ == '__main__':
    # Script entry point: collect all doulists, then scrape them with a
    # pool of producer threads feeding a pool of consumer threads.
    doulist_name_link = alldoulist()
    numconsumers = 4  # how many consumers to start
    numproducers = 4  # how many producers to start
    # messages per producer to put; producer() reads this module global
    nummessages = len(doulist_name_link) // numproducers
    safeprint = thread.allocate_lock()  # else prints may overlap
    dataQueue = Queue.Queue()  # shared global, infinite size
    # Truncate the output file; consumers reopen it in append mode.
    with open('doutemp2.txt', 'w') as fileout:
        fileout.write('')

    def run_producer(idnum, donelock):
        """Run one producer and release *donelock* when it ends."""
        try:
            producer(doulist_name_link, idnum, dataQueue)
        finally:
            donelock.release()

    for i in range(numconsumers):
        thread.start_new_thread(consumer, (i, dataQueue))

    # Threads started through ``thread`` die when the main thread exits,
    # so give every producer a lock that it releases on completion and
    # block on those locks instead of falling off the end of the script.
    donelocks = []
    for i in range(numproducers):
        donelock = thread.allocate_lock()
        donelock.acquire()
        donelocks.append(donelock)
        thread.start_new_thread(run_producer, (i, donelock))
    for donelock in donelocks:
        donelock.acquire()

    # Floor division leaves up to numproducers - 1 trailing entries (or all
    # of them when there are fewer entries than producers) unassigned.
    # With nummessages set to 1, producer(idnum=i) handles exactly entry i,
    # so the leftovers are processed here once the threaded producers are
    # done and no longer read the global.
    leftover = range(nummessages * numproducers, len(doulist_name_link))
    if leftover:
        nummessages = 1
        for i in leftover:
            producer(doulist_name_link, i, dataQueue)

    # Let the consumers drain the queue, allow a short grace period for a
    # record that was dequeued but not yet written, then take the output
    # lock so the final message cannot interleave with a consumer.
    while not dataQueue.empty():
        time.sleep(0.1)
    time.sleep(0.5)
    with safeprint:
        print('Main thread exit.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment