Skip to content

Instantly share code, notes, and snippets.

@DengYiping
Last active November 14, 2015 02:15
Show Gist options
  • Save DengYiping/6e37e0e4b344e3e7634a to your computer and use it in GitHub Desktop.
Save DengYiping/6e37e0e4b344e3e7634a to your computer and use it in GitHub Desktop.
This is the code for crawling the College Board SAT Question of the Day and storing the questions in MongoDB. My blog: www.geekinguniverse.com
#-*- coding: UTF-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import pymongo
import datetime
from Queue import Queue
from threading import Thread
def to_array(origins):
    """Return a list with the text representation of every item in ``origins``.

    Args:
        origins: Any iterable; each item is converted with the interpreter's
            text type.

    Returns:
        A new list of text strings, in the same order as ``origins``.
    """
    # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # function no longer fails with NameError on Python 3.
    text_type = type(u"")
    return [text_type(origin) for origin in origins]
def date_generator(start=None, end=None):
    """Build a queue with one entry per calendar day from ``start`` to ``end``.

    Args:
        start: First ``datetime.date`` to enqueue. Defaults to 2014-07-01.
        end: Last ``datetime.date`` to enqueue (inclusive). Defaults to
            ``datetime.date.today()``.

    Returns:
        A ``Queue`` whose items are ``[stamp, day]`` lists, where ``day`` is a
        ``datetime.date`` and ``stamp`` is the integer ``YYYYMMDD`` for that
        day. The queue is empty when ``end`` is earlier than ``start``.
    """
    if start is None:
        # Written without leading zeros: ``07``/``01`` are octal literals on
        # Python 2 and a syntax error on Python 3.
        start = datetime.date(2014, 7, 1)
    if end is None:
        end = datetime.date.today()

    dates = Queue()
    one_day = datetime.timedelta(days=1)
    current = start
    while current <= end:
        stamp = current.year * 10000 + current.month * 100 + current.day
        dates.put([stamp, current], False)
        current += one_day
    return dates
def main(num_threads=8):
    """Fill the date queue and start the crawler worker threads.

    Args:
        num_threads: Number of worker threads to start. Defaults to 8.

    The queue is fully populated before any worker starts. The threads are
    started but not joined here.
    """
    dates = date_generator()
    for _ in range(num_threads):
        worker = Thread(target=thread_main, args=(dates,))
        worker.start()
def thread_main(queue):
    """Worker loop: fetch the question for each queued date and store it.

    Args:
        queue: A queue of ``[stamp, day]`` items, where ``stamp`` is the
            integer date used as the question id and ``day`` is the matching
            ``datetime.date``.

    Each item is fetched with ``getQuestion`` and inserted into the ``qotd``
    collection of the ``stest`` database on the local MongoDB server. The
    worker returns once the queue is empty. An exception from the fetch or
    the insert still propagates and ends this worker, but the item is marked
    done and the client is closed first.
    """
    # Only the ``Empty`` exception is imported; the ``queue`` parameter is not
    # rebound by this statement.
    try:
        from queue import Empty  # Python 3
    except ImportError:
        from Queue import Empty  # Python 2

    # establish connection with database
    client = pymongo.MongoClient('localhost', 27017)
    try:
        qotds = client['stest']['qotd']
        while True:
            # Non-blocking get: checking ``empty()`` and then calling a
            # blocking ``get()`` lets another worker take the last item in
            # between, leaving this one blocked forever.
            try:
                date = queue.get(False)
            except Empty:
                break
            try:
                stems, choice_strs = getQuestion(str(date[0]))
                dic = {
                    "question": stems,
                    "key": choice_strs,
                    # Stored as a datetime (midnight of that day) built from
                    # the date's time tuple.
                    'time': datetime.datetime(*(date[1].timetuple()[:6])),
                }
                qotds.insert_one(dic)
                print(date[0])
            finally:
                # Always balance the get(), even when the fetch or insert fails.
                queue.task_done()
    finally:
        client.close()
def getQuestion(date, timeout=30):
    """Download one Question of the Day page and extract its stem and choices.

    Args:
        date: Question id as a string; it is appended to the page URL as the
            ``questionId`` query value.
        timeout: Seconds to wait for the HTTP request before giving up.
            Defaults to 30.

    Returns:
        A two-item list ``[stems, choice_strs]``. ``stems`` holds the text
        form of each child of the ``questionStem`` div, skipping bare newline
        nodes. ``choice_strs`` holds the text form of every ``label`` inside
        the question's ``fieldset``.

    Raises:
        AttributeError: If the page lacks the ``qotdQuestionContainer``
            element, the stem div or the fieldset (``find`` returns ``None``).
    """
    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u"")
    url = 'https://sat.collegeboard.org/practice/sat-question-of-the-day?questionId='+date+'&tq=1'

    # The context manager closes the session's connections; the timeout keeps
    # a stalled connection from blocking the caller indefinitely.
    with requests.session() as session:
        response = session.get(url, timeout=timeout)
        html = response.content

    soup = BeautifulSoup(html, 'html.parser')
    question = soup.find(id='qotdQuestionContainer')
    stem = question.find('div', 'questionStem')
    stems = [text_type(child) for child in stem.children if child != '\n']
    choices = question.find('fieldset').find_all('label')
    choice_strs = to_array(choices)
    return [stems, choice_strs]
# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment