parser
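A Python 2 scraper for wenda.so.com (360's Q&A site): it walks a category's user-ranking pages, then downloads every answer each newly seen user has posted, one text file per user under n360/, fetching in small thread-pool batches.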
# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool  # dummy = threads, not processes
from lxml import html
import requests

THREADS = 10
# stripped from user ids when building filenames (string.punctuation minus '@')
punctuation = '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# users.txt doubles as a resume log: every profile already scraped is listed here
users = []
userf = open('users.txt', 'r+')
for line in userf.readlines():
    users.append(line.rstrip())
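# Pipeline: scrapeCate pages through a category's user ranking, collects new
# profile links, and hands them to scrapeUser in batches of THREADS threads;
# scrapeUser pages through one user's answer list and pulls each answer's
# text via scrapeQA.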
def scrapeQA(url):
    """Fetch one question page and return its answer nodes (accepted + others)."""
    try:
        qa_page = requests.get(url, headers=headers)
        qa_tree = html.fromstring(qa_page.content)
        # xpath already returns [] when nothing matches
        return qa_tree.xpath('//div[@class="resolved-cnt"] | //div[@class="other-ans-cnt"]')
    except Exception:
        return []
def scrapeUser(userN):
    """Scrape every answer posted by one user into n360/<user>.txt."""
    i = 0
    cnt = 0
    pre_url = 'http://wenda.so.com'
    userN = userN.split('/')[-1]  # profile href -> bare user id
    pre_list_url = pre_url + '/u/an/' + userN + '?pn='
    try:
        # the n360/ directory must already exist; punctuation is stripped so
        # the user id is safe to use as a filename
        with open('n360/' + userN.translate(None, punctuation) + '.txt', 'w') as f:
            while True:
                list_url = pre_list_url + str(i)  # paginated answer list
                list_page = requests.get(list_url, headers=headers)
                list_tree = html.fromstring(list_page.content)
                items = list_tree.xpath('//li[@class="item" and @ans_id]/a')
                if len(items) == 0:  # ran out of pages
                    break
                answers = []
                for item_a in items:
                    qurl = pre_url + item_a.get('href')
                    answers = answers + scrapeQA(qurl)
                for ans in answers:
                    # one answer per line, inner newlines collapsed to spaces
                    f.write(ans.text_content().encode('utf-8').replace('\n', ' ') + '\n')
                cnt += len(items)
                print 'ready for a list, now %d items from user %s' % (cnt, userN)
                i += 1
    except Exception:
        return
def scrapeBatch(usrs):
    """Scrape a batch of users in parallel, then record them as done."""
    global users
    pool = ThreadPool(processes=THREADS)
    pool.map(scrapeUser, usrs)
    pool.close()
    pool.join()
    users += usrs
    for usr in usrs:
        userf.write(usr + '\n')

def scrapeCate(url):
    """Walk a category's ranking pages and scrape every user not seen before."""
    cnt = 0
    i = 1
    usrs = []
    while True:
        tar_url = url + str(i)  # paginated ranking list
        page = requests.get(tar_url, headers=headers)
        tree = html.fromstring(page.content)
        people = tree.xpath('//span[@class="name"]/a')
        if len(people) == 0:  # ran out of pages
            break
        for a in people:
            usr = a.get('href')
            if usr in users:  # already scraped on an earlier run
                continue
            usrs.append(usr)
            if len(usrs) == THREADS:
                scrapeBatch(usrs)
                cnt += THREADS
                print 'ready for %d users' % cnt
                usrs = []
        i += 1
    if usrs:  # flush the final, partial batch
        scrapeBatch(usrs)
        cnt += len(usrs)
        print 'ready for %d users' % cnt

scrapeCate('http://wenda.so.com/rank?type=5&cid=-1&pn=')
userf.close()
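For a quick smoke test, one might drive a single user scrape directly instead of crawling a whole category; a minimal sketch, assuming the n360/ output directory may not exist yet (the script itself never creates it) and using a made-up profile id:

import os

if not os.path.isdir('n360'):
    os.mkdir('n360')
scrapeUser('/u/360U123456789')  # hypothetical profile href, as found on the ranking pages

Note that multiprocessing.dummy.Pool is a thread pool rather than a process pool, which fits this workload: the time is spent almost entirely on network I/O, so threads overlap the requests without any fork overhead.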