# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\.spyder2\.temp.py
"""
import os
import urllib2
from bs4 import BeautifulSoup
from time import clock
from time import sleep
from random import randint
# Showing 31,065 closed projects with a public gallery. 1295 pages
# aggregate pages --> thread names --> thread replies
'''
1.0 Get thread_names from aggregate pages
'''
# total_page_num = 1295
#def get_thread_list(total_page_num, start_page):
#    for page_num in range(start_page, total_page_num + 1):
#        # sleeping
#        snap_time = 0.01 * randint(1, 20)
#        print "sleeping for %s seconds" % snap_time
#        sleep(snap_time)
#        # timing
#        time_pass = clock() - time_start
#        if round(time_pass) % 10 == 0:
#            print "working for %s seconds, %d pages scanned (proportion = %.2f%%)" % (round(time_pass), page_num, page_num * 100.0 / total_page_num)
#        # page_num = 1
#        # prepare url
#        url1 = "http://www.crowdspring.com/browse/?page="
#        url2 = "&status=closed&gallery=public"
#        url = url1 + str(page_num) + url2
#        # open the url
#        page = urllib2.urlopen(url, timeout=10).read()
#        soup = BeautifulSoup(page)
#        urls = soup.find_all('td', {'class': 'project'})
#        scores = soup.find_all('td', {'class': 'score'})
#        entries = soup.find_all('span', {'class': 'entries'})
#        awards = soup.find_all('td', {'class': 'award'})
#        for i in range(len(urls)):
#            url = urls[i].a['href']
#            score = scores[i].span.string
#            entry = entries[i].a.string
#            award = awards[i].a.string
#            print >> filesave, "%s,%s,%s,%s,%s" % (page_num, url, score, entry, award)
#os.chdir("D:/chengjun/crowdspring/")
#filesave = open('./crowd_thread_list1.csv', 'wb')
#time_start = clock()
#get_thread_list(1295, 1)
#filesave.close()
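# Each line of the crowd_thread_list*.csv files produced by section 1.0 has
# the layout: page_num, short_url, score, entries, award. crawler() below
# relies on this: it reads short_url from column 1 and the entry count from
# column 3. A purely illustrative (made-up) row:
#   63,/logo-design/project/12345_example-logo/,80,12 entries,$200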
'''
2.0 Get replies for a given thread
'''
def sleeping():  # sleep briefly to be polite to the server
    snap_time = 0.001 * randint(1, 20)  # print "sleeping for %s seconds" % snap_time
    sleep(snap_time)

def timing():  # report elapsed time roughly every 10 seconds
    time_pass = clock() - time_start
    if round(time_pass) % 10 == 0:
        print "working for %s seconds" % round(time_pass)

# progress
def progress():
    global project_num
    project_num += 1
    print "%d projects scanned (proportion = %.2f%%)" % (project_num, project_num * 100.0 / total_project_num)
def scrape_entry(entry):
    # entry = entries[26]
    rating = entry.find('div', {'class': 'ratingcount'}).string
    sequence_id = entry.find('div', {'class': 'entrycount'}).string
    entry_url = entry.find_all('a')[1]['href']
    user = entry.a['href']
    # award info
    if entry_url != '#':
        try:
            award = entry.find('div', {'class': 'flag'}).string
        except Exception:
            award = "NA"
        # sleeping and timing
        sleeping()
        timing()
        # get entry_time
        entry_url_full = 'http://www.crowdspring.com' + entry_url
        # try many times
        attempts = 1
        while attempts < 10:
            try:
                entry_page = urllib2.urlopen(entry_url_full, timeout=20).read()
                break
            except Exception:
                attempts += 1
                print "another try!"
                sleeping()
        # parse the page (if every attempt failed, the NameError on
        # entry_page is caught by the caller and recorded as an error row)
        entry_soup = BeautifulSoup(entry_page)
        entry_time = entry_soup.find_all('p', {'class': 'date'})[0].get_text()
    else:
        print "withdrawn!!! No entry_time information"
        entry_time = 'Withdrawn'
        award = 'NA'
    return sequence_id, entry_url, user, rating, entry_time, award
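# scrape_entry() returns a 6-tuple:
#   (sequence_id, entry_url, user, rating, entry_time, award)
# For withdrawn entries (entry_url == '#'), entry_time is the literal string
# 'Withdrawn', which crawler() uses to stop scanning the rest of a page.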
def crawler(threads_list):
    for thread in threads_list:
        # thread = threads[2]
        entry_numbers = thread.split(',')[3]
        if entry_numbers != "0 entries":
            short_url = thread.split(',')[1]
            # short_url = '/print-design/project/2284417_pestacides-journal-being-entered-into-chemcore-magazine/'
            thread_url = 'http://www.crowdspring.com' + short_url
            # The first page
            sleeping()
            progress()
            # try many times
            attempts = 1
            while attempts < 10:
                try:
                    thread_page = urllib2.urlopen(thread_url, timeout=20).read()
                    break
                except Exception:
                    attempts += 1
                    print "another try!"
                    sleeping()
            # parse the page
            soup = BeautifulSoup(thread_page)
            project_by = soup.find_all('div', {'class': 'pageheader'})[0].get_text().split('project by ')[-1].strip()
            page_num = int(soup.find_all('div', {'class': 'paginator'})[0].span.get_text().split('of ')[1])
            entries = soup.find_all('div', {'class': 'entry-display'})
            entry_time = 0
            for entry in entries:
                try:
                    entry = scrape_entry(entry)
                    entry_time = entry[4]
                except Exception:
                    entry = ['error'] * 6
                print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                if entry_time == "Withdrawn":
                    print "jump out of withdrawns"
                    break
            # The remaining pages, if any
            if page_num > 1:
                for sub_page in range(2, page_num + 1):
                    thread_url_more = thread_url + 'page' + str(sub_page) + '/'
                    # thread_url_more = "http://www.crowdspring.com/small-website/project/31658_crowdspring-homepage/page7/"
                    # try many times
                    attempts = 1
                    while attempts < 10:
                        try:
                            thread_page = urllib2.urlopen(thread_url_more, timeout=20).read()
                            break
                        except Exception:
                            attempts += 1
                            print "another try!"
                            sleeping()
                    # parse the page
                    soup = BeautifulSoup(thread_page)
                    entries = soup.find_all('div', {'class', 'entry-display'})
                    for entry in entries:
                        try:
                            entry = scrape_entry(entry)
                            entry_time = entry[4]
                        except Exception:
                            entry = ['error'] * 6
                        print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                        if entry_time == "Withdrawn":
                            print "jump out of withdrawns"
                            break
        else:
            print entry_numbers
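# Each row written to crowd_entry_list_*.csv has the layout (column names
# taken from the variables above; entries that failed to parse appear as
# rows of 'error'):
#   short_url, project_by, sequence_id, entry_url, user, rating, entry_time, award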
os.chdir("D:/chengjun/crowdspring/")
thread_file = open('./crowd_thread_list63.csv', 'r')
threads = thread_file.readlines()
entry_save = open('./crowd_entry_list_3.csv', 'wb')
# work
time_start = clock()
total_project_num = 30945
project_num = 0
# crawler
crawler(threads)
entry_save.close()