# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\.spyder2\.temp.py
"""
import os
import urllib2
from bs4 import BeautifulSoup
from time import clock
from time import sleep
from random import randint
# Showing 31,065 closed projects with a public gallery (1,295 pages).
# Pipeline: aggregate pages --> thread names --> thread replies
'''
1.0 Get thread_names from aggregate pages
'''
# total_page_num = 1295
#def get_thread_list(total_page_num, start_page):
#    for page_num in range(start_page, total_page_num + 1):
#        # sleeping
#        snap_time = 0.01 * randint(1, 20)
#        print "sleeping for %s seconds" % snap_time
#        sleep(snap_time)
#        # timing
#        time_pass = clock() - time_start
#        if round(time_pass) % 10 == 0:
#            print "working for %s seconds, %d pages scanned (proportion = %.2f%%)" % (round(time_pass), page_num, page_num * 100.0 / total_page_num)
#        # prepare the aggregate-page url, e.g. page_num = 1
#        url1 = "http://www.crowdspring.com/browse/?page="
#        url2 = "&status=closed&gallery=public"
#        url = url1 + str(page_num) + url2
#        # fetch and parse the page
#        page = urllib2.urlopen(url, timeout=10).read()
#        soup = BeautifulSoup(page)
#        urls = soup.find_all('td', {'class': 'project'})
#        scores = soup.find_all('td', {'class': 'score'})
#        entries = soup.find_all('span', {'class': 'entries'})
#        awards = soup.find_all('td', {'class': 'award'})
#        # one csv row per project: page number, url, score, entries, award
#        for i in range(len(urls)):
#            url = urls[i].a['href']
#            score = scores[i].span.string
#            entry = entries[i].a.string
#            award = awards[i].a.string
#            print >> filesave, "%s,%s,%s,%s,%s" % (page_num, url, score, entry, award)
#os.chdir("D:/chengjun/crowdspring/")
#filesave = open('./crowd_thread_list1.csv', 'wb')
#time_start = clock()
#get_thread_list(1295, 1)
#filesave.close()
'''
2.0 Get replies for a given thread
'''
def sleeping():
    # sleep a random 0.001-0.02 seconds between requests
    snap_time = 0.001 * randint(1, 20)
    sleep(snap_time)

def timing():
    # report elapsed time roughly every 10 seconds
    time_pass = clock() - time_start
    if round(time_pass) % 10 == 0:
        print "working for %s seconds" % round(time_pass)

def progress():
    # report how many projects have been scanned so far
    global project_num
    project_num += 1
    print "%d projects scanned (proportion = %.2f%%)" % (project_num, project_num * 100.0 / total_project_num)
def scrape_entry(entry):
    # entry is one 'entry-display' div from a thread page, e.g. entries[26]
    rating = entry.find('div', {'class': 'ratingcount'}).string
    sequence_id = entry.find('div', {'class': 'entrycount'}).string
    entry_url = entry.find_all('a')[1]['href']
    user = entry.a['href']
    if entry_url != '#':
        # award info
        try:
            award = entry.find('div', {'class': 'flag'}).string
        except AttributeError:
            award = "NA"
        # sleeping and timing
        sleeping()
        timing()
        # fetch the entry page to get entry_time, trying up to 10 times
        entry_url_full = 'http://www.crowdspring.com' + entry_url
        attempts = 1
        while attempts < 10:
            try:
                entry_page = urllib2.urlopen(entry_url_full, timeout=20).read()
                break
            except:
                attempts += 1
                print "another try!"
                sleeping()
        # parse the page; if every attempt failed, the NameError raised
        # here is caught by the caller, which writes an error row
        entry_soup = BeautifulSoup(entry_page)
        entry_time = entry_soup.find_all('p', {'class': 'date'})[0].get_text()
    else:
        # withdrawn entries link to '#' and carry no date
        print "withdrawn! no entry_time information"
        entry_time = 'Withdrawn'
        award = 'NA'
    return sequence_id, entry_url, user, rating, entry_time, award
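# scrape_entry returns a 6-tuple of strings, e.g. (values hypothetical):
#   ('#12', '/logo-design/entry/123456/', '/users/somedesigner/',
#    '4', 'Jun 20, 2014', 'NA')
# i.e. (sequence_id, entry_url, user, rating, entry_time, award).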
def crawler(threads_list):
    for thread in threads_list:
        # each thread-list line: page_num,short_url,score,entries,award
        # e.g. thread = threads[2]
        entry_numbers = thread.split(',')[3]
        if entry_numbers != "0 entries":
            short_url = thread.split(',')[1]
            # short_url = '/print-design/project/2284417_pestacides-journal-being-entered-into-chemcore-magazine/'
            thread_url = 'http://www.crowdspring.com' + short_url
            # the first page
            sleeping()
            progress()
            # try up to 10 times
            attempts = 1
            while attempts < 10:
                try:
                    thread_page = urllib2.urlopen(thread_url, timeout=20).read()
                    break
                except:
                    attempts += 1
                    print "another try!"
                    sleeping()
            # parse the page
            soup = BeautifulSoup(thread_page)
            project_by = soup.find_all('div', {'class': 'pageheader'})[0].get_text().split('project by ')[-1].strip()
            page_num = int(soup.find_all('div', {'class': 'paginator'})[0].span.get_text().split('of ')[1])
            entries = soup.find_all('div', {'class': 'entry-display'})
            entry_time = 0
            for entry in entries:
                try:
                    entry = scrape_entry(entry)
                    entry_time = entry[4]
                except:
                    entry = ['error'] * 6
                print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                if entry_time == "Withdrawn":
                    print "jump out of withdrawns"
                    break
            # remaining pages, if any
            if page_num > 1:
                for sub_page in range(2, page_num + 1):
                    thread_url_more = thread_url + 'page' + str(sub_page) + '/'
                    #thread_url_more = "http://www.crowdspring.com/small-website/project/31658_crowdspring-homepage/page7/"
                    # try up to 10 times
                    attempts = 1
                    while attempts < 10:
                        try:
                            thread_page = urllib2.urlopen(thread_url_more, timeout=20).read()
                            break
                        except:
                            attempts += 1
                            print "another try!"
                            sleeping()
                    # parse the page
                    soup = BeautifulSoup(thread_page)
                    entries = soup.find_all('div', {'class': 'entry-display'})
                    for entry in entries:
                        try:
                            entry = scrape_entry(entry)
                            entry_time = entry[4]
                        except:
                            entry = ['error'] * 6
                        print >> entry_save, "%s,%s,%s,%s,%s,%s,%s,%s" % (short_url, project_by, entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
                        if entry_time == "Withdrawn":
                            print "jump out of withdrawns"
                            break
        else:
            print entry_numbers
os.chdir("D:/chengjun/crowdspring/")
thread_file = open('./crowd_thread_list63.csv', 'r')
threads = thread_file.readlines()
thread_file.close()
entry_save = open('./crowd_entry_list_3.csv', 'wb')
# work
time_start = clock()
total_project_num = 30945
project_num = 0
# crawl every thread in the list
crawler(threads)
entry_save.close()
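# Each line of crowd_entry_list_3.csv holds eight comma-separated fields:
# short_url, project_by, sequence_id, entry_url, user, rating, entry_time, award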