Created May 26, 2014 09:08
Douban dongxi doulist scraper with thread and Queue (producer/consumer)
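The script follows a classic producer/consumer layout: producer threads fetch and parse doulist pages and push one row per item onto a shared Queue.Queue, while consumer threads drain the queue and append the rows to a file under a lock. Below is a minimal sketch of that pattern on its own, with the scraping stripped out; the thread counts and the placeholder work items are illustrative, not part of the original script.

# Producer/consumer sketch in Python 2, using the same modules as the script below.
import thread
import Queue
import time

dataQueue = Queue.Queue()           # unbounded, thread-safe FIFO
safeprint = thread.allocate_lock()  # serialize output from several threads

def producer(idnum):
    # Stand-in for the real work: the actual script puts scraped rows here.
    for n in range(3):
        dataQueue.put('producer %d item %d' % (idnum, n))

def consumer(idnum):
    while True:
        time.sleep(0.1)
        try:
            data = dataQueue.get(block=False)
        except Queue.Empty:
            pass
        else:
            with safeprint:
                print 'consumer', idnum, 'got =>', data

if __name__ == '__main__':
    for i in range(2):
        thread.start_new_thread(consumer, (i,))
    for i in range(2):
        thread.start_new_thread(producer, (i,))
    time.sleep(2)  # give the worker threads time to run before the process exits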
# -*- coding: utf-8 -*-
import urllib2
import thread
import Queue
import time
from bs4 import BeautifulSoup
def findlen(item):
    # Return the number of pages in one doulist category.
    url = 'http://dongxi.douban.com/doulists/%s/?start=0' % (item)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    tempage = soup.find('div', class_='paginator').text
    lenitem = int(tempage.split()[-2])
    return lenitem
def getname_link(item, d, doulist_name_link):
    # Collect (title, link) pairs from one page of a doulist category.
    url = 'http://dongxi.douban.com/doulists/%s/?start=%d' % (item, d)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    name_link = soup.findAll('a', class_='doulist-title')
    for unit in name_link:
        doulist_name_link.append((unit.text.strip(), unit.get('href')))
def test(idx, name, url, allpage, dataQueue):
    # Scrape every item card on every page of one doulist and queue the rows.
    for page in allpage:
        aurl = '%s?start=%d' % (url.split('?')[0], page)
        html_doc = urllib2.urlopen(aurl).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        for itemchild in soup.findAll('li', class_='carditem card-story-large '):
            name = itemchild.find('a').get('title').strip()
            link = itemchild.find('a').get('href')
            try:
                price = itemchild.find('span', class_="commodity-price").text
            except AttributeError:
                price = 'None'  # item card without a price tag
            finally:
                cons = itemchild.find('ul', class_="stats-list").text.split()
                data = [str(idx), name, link, price] + cons
                dataQueue.put(data)
def producer(doulist_name_link, idnum, dataQueue):
    # Each producer thread scrapes its own slice of the doulist list.
    for msgnum in range(nummessages):
        msgnum += idnum * nummessages
        name, url = doulist_name_link[msgnum]
        url = url.strip()
        html_doc = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        try:
            tempage = soup.find('div', class_='paginator').text
        except AttributeError:
            allpage = [0]  # single page: no paginator on this doulist
        else:
            lenpage = int(tempage.split()[-2])
            allpage = map(lambda x: 20 * x, range(lenpage))
        finally:
            test(msgnum, name, url, allpage, dataQueue)
def consumer(idnum, dataQueue):
    # Consumer threads poll the queue and append finished rows to the output file.
    while True:
        time.sleep(0.1)
        try:
            data = dataQueue.get(block=False)
        except Queue.Empty:
            pass
        else:
            with safeprint:
                with open('doutemp2.txt', 'a+') as fileout:
                    fileout.write('consumer %s got => %s\n' %
                                  (str(idnum), '\t'.join(data).encode('utf-8')))
                print 'consumer', idnum, 'got =>', data[0]
def alldoulist():
    # Build the full list of (title, link) pairs across all four categories.
    doulist_name_link = []
    list_all = {'life': 83, 'interesting': 25,
                'fashion': 35, 'tech': 10}  # page counts as of 20140525
    for k, v in list_all.items():
        v = findlen(k)
        list_all[k] = v  # refresh the dict with the current page count
        print k, v
        allpage = map(lambda x: 20 * x, range(v))
        for d in allpage:
            getname_link(k, d, doulist_name_link)  # collect list names and links
    with open('doulist.txt', 'w') as filelist:
        for x in doulist_name_link:
            filelist.write('\t'.join(x) + '\n')
    return doulist_name_link
if __name__ == '__main__':
    doulist_name_link = alldoulist()
    numconsumers = 4  # how many consumers to start
    numproducers = 4  # how many producers to start
    # messages per producer to put (integer division: any remainder is skipped)
    nummessages = len(doulist_name_link) / numproducers
    safeprint = thread.allocate_lock()  # else prints may overlap
    dataQueue = Queue.Queue()  # shared global, infinite size
    with open('doutemp2.txt', 'w') as fileout:
        fileout.write('')  # truncate the output file before the consumers append to it
    for i in range(numconsumers):
        thread.start_new_thread(consumer, (i, dataQueue))
    for i in range(numproducers):
        thread.start_new_thread(producer, (doulist_name_link, i, dataQueue))
    # thread.start_new_thread workers die as soon as the main thread returns,
    # so keep it alive long enough for the scraping to finish
    time.sleep(600)
    print 'Main thread exit.'