# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\.spyder2\.temp.py
"""
import os
import urllib2
from bs4 import BeautifulSoup
from time import clock
from time import sleep
from random import randint
# Showing 31,065 closed projects with a public gallery. 1295 pages
# aggregate pages --> thread names --> thread replies
'''
1.0 Get thread_names from aggregate pages
'''
# total_page_num = 1295
#def get_thread_list(total_page_num, start_page):
#    for page_num in range(start_page, total_page_num + 1):
#        # sleeping
#        snap_time = 0.01 * randint(1, 20)
#        print "sleeping for %s seconds" % snap_time
#        sleep(snap_time)
#        # timing
#        time_pass = clock() - time_start
#        if round(time_pass) % 10 == 0:
#            print "working for %s seconds, %d pages scanned (proportion = %.2f%%)" % (round(time_pass), page_num, page_num * 100.0 / total_page_num)
#        # page_num = 1
#        # prepare url
#        url1 = "http://www.crowdspring.com/browse/?page="
#        url2 = "&status=closed&gallery=public"
#        url = url1 + str(page_num) + url2
#        # open the url
#        page = urllib2.urlopen(url, timeout=10).read()
#        soup = BeautifulSoup(page)
#        urls = soup.find_all('td', {'class': 'project'})
#        scores = soup.find_all('td', {'class': 'score'})
#        entries = soup.find_all('span', {'class': 'entries'})
#        awards = soup.find_all('td', {'class': 'award'})
#        for i in range(len(urls)):
#            url = urls[i].a['href']
#            score = scores[i].span.string
#            entry = entries[i].a.string
#            award = awards[i].a.string
#            print >> filesave, "%s,%s,%s,%s,%s" % (page_num, url, score, entry, award)
#os.chdir("D:/chengjun/crowdspring/")
#filesave = open('./crowd_thread_list1.csv', 'wb')
#time_start = clock()
#get_thread_list(1295, 1)
#filesave.close()
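# Each line of the crowd_thread_list*.csv files produced by section 1.0 has
# the layout: page_num, short_url, score, entries, award. crawler() below
# relies on this: it reads short_url from column 1 and the entry count from
# column 3. A purely illustrative (made-up) row:
#   63,/logo-design/project/12345_example-logo/,80,12 entries,$200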
'''
2.0 Get replies for a given thread
'''
def sleeping():  # sleep briefly to be polite to the server
    snap_time = 0.001 * randint(1, 20)  # print "sleeping for %s seconds" % snap_time
    sleep(snap_time)

def timing():  # report elapsed time roughly every 10 seconds
    time_pass = clock() - time_start
    if round(time_pass) % 10 == 0:
        print "working for %s seconds" % round(time_pass)

# progress
def progress():
    global project_num
    project_num += 1
    print "%d projects scanned (proportion = %.2f%%)" % (project_num, project_num * 100.0 / total_project_num)
def scrape_entry(entry):
    # entry = entries[26]
    rating = entry.find('div', {'class': 'ratingcount'}).string
    sequence_id = entry.find('div', {'class': 'entrycount'}).string
    entry_url = entry.find_all('a')[1]['href']
    user = entry.a['href']
    # award info
    if entry_url != '#':
        try:
            award = entry.find('div', {'class': 'flag'}).string
        except Exception:
            award = "NA"
        # sleeping and timing
        sleeping()
        timing()
        # get entry_time
        entry_url_full = 'http://www.crowdspring.com' + entry_url
        # try many times
        attempts = 1
        while attempts < 10:
            try:
                entry_page = urllib2.urlopen(entry_url_full, timeout=20).read()
                break
            except Exception:
                attempts += 1
                print "another try!"
                sleeping()
        # parse the page (if every attempt failed, the NameError on
        # entry_page is caught by the caller and recorded as an error row)
        entry_soup = BeautifulSoup(entry_page)
        entry_time = entry_soup.find_all('p', {'class': 'date'})[0].get_text()
    else:
        print "withdrawn!!! No entry_time information"
        entry_time = 'Withdrawn'
        award = 'NA'
    return sequence_id, entry_url, user, rating, entry_time, award
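# scrape_entry() returns a 6-tuple:
#   (sequence_id, entry_url, user, rating, entry_time, award)
# For withdrawn entries (entry_url == '#'), entry_time is the literal string
# 'Withdrawn', which crawler() uses to stop scanning the rest of a page.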
def crawler(threads_list):
    for thread in threads_list:
        # thread = threads[2]
        entry_numbers = thread.split(',')[3]
        if entry_numbers != "0 entries":
            short_url = thread.split(',')[1]
            # short_url = '/print-design/project/2284417_pestacides-journal-being-entered-into-chemcore-magazine/'
            thread_url = 'http://www.crowdspring.com' + short_url
            # The first page
            sleeping()
            progress()
            # try many times
            attempts = 1
            while attempts < 10:
                try:
                    thread_page = urllib2.urlopen(thread_url, timeout=20).read()
                    break
                except Exception:
                    attempts += 1
                    print "another try!"
                    sleeping()
            # parse the page
            soup = BeautifulSoup(thread_page)
            project_by = soup.find_all('div', {'class': 'pageheader'})[0].get_text().split('project by ')[-1].strip()
            page_num = int(soup.find_all('div', {'class': 'paginator'})[0].span.get_text().split('of ')[1])
            entries = soup.find_all('div', {'class': 'entry-display'})
            entry_time = 0
            for entry in entries:
                try:
                    entry = scrape_entry(entry)
                    entry_time = entry[4]
                except Exception:
                    entry = ['error'] * 6
                print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                if entry_time == "Withdrawn":
                    print "jump out of withdrawns"
                    break
            # The remaining pages, if any
            if page_num > 1:
                for sub_page in range(2, page_num + 1):
                    thread_url_more = thread_url + 'page' + str(sub_page) + '/'
                    # thread_url_more = "http://www.crowdspring.com/small-website/project/31658_crowdspring-homepage/page7/"
                    # try many times
                    attempts = 1
                    while attempts < 10:
                        try:
                            thread_page = urllib2.urlopen(thread_url_more, timeout=20).read()
                            break
                        except Exception:
                            attempts += 1
                            print "another try!"
                            sleeping()
                    # parse the page
                    soup = BeautifulSoup(thread_page)
                    entries = soup.find_all('div', {'class', 'entry-display'})
                    for entry in entries:
                        try:
                            entry = scrape_entry(entry)
                            entry_time = entry[4]
                        except Exception:
                            entry = ['error'] * 6
                        print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                        if entry_time == "Withdrawn":
                            print "jump out of withdrawns"
                            break
        else:
            print entry_numbers
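# Each row written to crowd_entry_list_*.csv has the layout (column names
# taken from the variables above; entries that failed to parse appear as
# rows of 'error'):
#   short_url, project_by, sequence_id, entry_url, user, rating, entry_time, award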
os.chdir("D:/chengjun/crowdspring/")
thread_file = open('./crowd_thread_list63.csv', 'r')
threads = thread_file.readlines()
entry_save = open('./crowd_entry_list_3.csv', 'wb')
# work
time_start = clock()
total_project_num = 30945
project_num = 0
# crawler
crawler(threads)
entry_save.close()