Last active
November 14, 2015 02:15
-
-
Save DengYiping/6e37e0e4b344e3e7634a to your computer and use it in GitHub Desktop.
This is the code for crawling the College Board Question of the Day and storing the questions in MongoDB. My blog: www.geekinguniverse.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: UTF-8 -*- | |
import requests | |
import re | |
from bs4 import BeautifulSoup | |
import pymongo | |
import datetime | |
from Queue import Queue | |
from threading import Thread | |
def to_array(origins):
    """Return a list holding the text form of every item in *origins*.

    Args:
        origins: Any iterable; each item is converted with the interpreter's
            text type.

    Returns:
        A new list of text strings, in iteration order. An empty iterable
        yields an empty list.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # conversion no longer relies on the Python-2-only ``unicode`` builtin.
    text_type = type(u"")
    return [text_type(origin) for origin in origins]
def date_generator(start=datetime.date(2014, 7, 1), end=None):
    """Build a queue with one entry per calendar day from *start* to *end*.

    Each entry is a two-item list ``[stamp, day]`` where ``day`` is a
    ``datetime.date`` and ``stamp`` is that day encoded as the integer
    ``YYYYMMDD`` (for example ``20140701``).

    Args:
        start: First day to enqueue. Defaults to 1 July 2014.
        end: Last day to enqueue, inclusive. Defaults to today's date,
            evaluated when the function is called.

    Returns:
        A ``Queue`` holding the entries in chronological order. The queue is
        empty when *start* is later than *end*.
    """
    if end is None:
        end = datetime.date.today()

    dates = Queue()
    # A negative span gives an empty range, so nothing is enqueued.
    for offset in range((end - start).days + 1):
        day = start + datetime.timedelta(days=offset)
        stamp = day.year * 10000 + day.month * 100 + day.day
        # The queue is unbounded, so the non-blocking put cannot fail.
        dates.put_nowait([stamp, day])
    return dates
def main():
    """Fill the shared date queue and start eight worker threads on it.

    The threads are started but not joined, so this function returns as soon
    as the last worker has been launched.
    """
    pending_dates = date_generator()
    workers = [
        Thread(target=thread_main, args=(pending_dates,))
        for _ in range(8)
    ]
    for worker in workers:
        worker.start()
def thread_main(queue):
    """Worker loop: fetch the question for each queued date and store it.

    Opens its own MongoDB connection (localhost:27017, database ``stest``,
    collection ``qotd``), then repeatedly takes a ``[stamp, day]`` entry from
    *queue*, downloads that day's question with ``getQuestion`` and inserts
    one document with the keys ``question``, ``key`` and ``time``.

    Args:
        queue: Queue of ``[stamp, day]`` entries shared between workers.

    Raises:
        Whatever ``getQuestion`` or the database insert raises; the current
        entry is still marked done and the connection is closed first.
    """
    try:
        from Queue import Empty  # Python 2 module name
    except ImportError:
        from queue import Empty  # Python 3 module name

    # establish connection with database
    client = pymongo.MongoClient('localhost', 27017)
    try:
        qotds = client['stest']['qotd']
        while True:
            try:
                # Non-blocking get: testing empty() and then calling a
                # blocking get() lets another worker take the last entry in
                # between, which would leave this thread waiting forever.
                entry = queue.get(False)
            except Empty:
                break
            try:
                stems, choice_strs = getQuestion(str(entry[0]))
                dic = {
                    "question": stems,
                    "key": choice_strs,
                    # Widen the date to a midnight datetime; presumably
                    # needed because the database driver cannot store a
                    # plain date -- TODO confirm.
                    'time': datetime.datetime(*(entry[1].timetuple()[:6])),
                }
                qotds.insert_one(dic)
                # Parenthesised single argument prints the same text under
                # both the print statement and the print function.
                print(entry[0])
            finally:
                # Always balance the get(), even when the fetch or the
                # insert failed.
                queue.task_done()
    finally:
        client.close()
def getQuestion(date, timeout=30):
    """Download one Question of the Day page and extract stem and choices.

    Args:
        date: Question id as a string, appended to the ``questionId`` query
            parameter (the worker passes a ``YYYYMMDD`` stamp).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``[stems, choice_strs]``: two lists of text strings. ``stems`` holds
        the markup of each child of the ``questionStem`` div, skipping bare
        newline nodes; ``choice_strs`` holds the markup of each ``label``
        inside the question's ``fieldset``.

    Raises:
        ValueError: If the page lacks the question container, the stem div
            or the choices fieldset.
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = 'https://sat.collegeboard.org/practice/sat-question-of-the-day?questionId='+date+'&tq=1'
    # The context manager closes the session's connections once the page has
    # been read; the timeout keeps a stalled server from blocking forever.
    with requests.session() as s:
        html = s.get(url, timeout=timeout).content

    soup = BeautifulSoup(html, 'html.parser')
    question = soup.find(id='qotdQuestionContainer')
    if question is None:
        raise ValueError('no qotdQuestionContainer element for questionId ' + date)
    stem = question.find('div', 'questionStem')
    fieldset = question.find('fieldset')
    if stem is None or fieldset is None:
        raise ValueError('incomplete question markup for questionId ' + date)

    # Reuse the shared helper for both conversions; newline-only nodes
    # between tags are dropped from the stem.
    stems = to_array(child for child in stem.children if child != '\n')
    choice_strs = to_array(fieldset.find_all('label'))
    return [stems, choice_strs]
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment