Created
August 20, 2015 16:31
-
-
Save mcrowe/ea5101a1b8442838ce72 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetches stories from Hacker News and stores them in an sqlite db.
#
# Uses a thread pool of 25 workers to do the http requests to make things
# faster.
#
# Usage:
#   python fetch_hn_stories 1 1000
#
from hackernews import HackerNews | |
from threading import Thread | |
from Queue import Queue | |
from time import gmtime, strftime | |
import sys | |
import sqlite3 | |
import pyprind | |
import time | |
# Both ends of the id range are required on the command line:
#   argv[1] = first_id (lowest id), argv[2] = last_id (highest id).
if len(sys.argv) < 3:
    raise Exception('first_id and last_id must be provided')

# Ids are walked from last_id down to first_id. range() excludes its stop
# value, so stop at first_id - 1 to make sure first_id itself is fetched.
STORY_IDS = range(int(sys.argv[2]), int(sys.argv[1]) - 1, -1)
NUM_STORY_IDS = len(STORY_IDS)

# Fail early with a clear message instead of an IndexError further down when
# the range is empty.
if NUM_STORY_IDS == 0:
    raise Exception('first_id must not be greater than last_id')

DB_NAME = 'hackernews.db'
NUM_FETCHERS = 25

# Rows are bound as plain tuples (see build_item_row), so the statement uses
# positional "?" placeholders. The placeholder order must match the column
# list below and the order of the tuple built by build_item_row.
INSERT_ITEM_SQL = """
INSERT OR IGNORE INTO items (item_id, item_type, by, score, comments, url, title, submission_time)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
def build_item_row(item):
    """Flatten a fetched item into the tuple bound to INSERT_ITEM_SQL.

    The tuple order is: item_id, item_type, by, score, comments, url, title,
    submission_time. ``comments`` is the number of entries in ``item.kids``,
    or 0 when ``item.kids`` is empty or None.
    """
    comment_count = len(item.kids or ())
    row = (
        item.item_id,
        item.item_type,
        item.by,
        item.score,
        comment_count,
        item.url,
        item.title,
        item.submission_time,
    )
    return row
def fetcher():
    """Worker loop: fetch queued item ids from Hacker News.

    Repeatedly takes an id from ``id_queue``, fetches it with ``hn.get_item``
    and puts the result on ``item_queue``. The loop never returns, so it is
    meant to run on a daemon thread.
    """
    while True:
        item_id = id_queue.get()
        try:
            item_queue.put(hn.get_item(item_id))
        except Exception:
            # Best effort: an id that cannot be fetched is skipped so that a
            # single failed request does not kill the worker.
            pass
        finally:
            # Mark the id as done only once the fetch attempt has finished, so
            # that id_queue.join() really means "every id has been processed".
            id_queue.task_done()
def start_worker(job):
    """Run the callable ``job`` on a new daemon thread.

    The thread is flagged as a daemon before it is started, so it does not
    keep the process alive once the main thread exits.
    """
    worker = Thread(target=job)
    worker.daemon = True
    worker.start()
hn = HackerNews()
id_queue = Queue(NUM_STORY_IDS)
item_queue = Queue(5000)


def _store_fetched_items(conn):
    """Drain item_queue and insert the stories found into the database.

    Every drained item advances the progress bar; only items whose
    ``item_type`` is 'story' are written. Nothing is executed or committed
    when no story was drained.
    """
    rows = []
    while not item_queue.empty():
        item = item_queue.get()
        item_queue.task_done()
        bar.update(item_id=item.item_id)
        if item.item_type == 'story':
            rows.append(build_item_row(item))
    if rows:
        conn.executemany(INSERT_ITEM_SQL, rows)
        conn.commit()


# Start fetcher workers.
for _ in range(NUM_FETCHERS):
    start_worker(fetcher)

# Add ids to fetch queue
print('Enqueuing ids ' + str(STORY_IDS[0]) + '-' + str(STORY_IDS[-1]))
for story_id in STORY_IDS:
    id_queue.put(story_id)

bar = pyprind.ProgBar(NUM_STORY_IDS, width=100)

# sqlite3's "with conn" only commits or rolls back; it does not close the
# connection, so close it explicitly.
conn = sqlite3.connect(DB_NAME)
try:
    # Store fetched items in batches, polling every few seconds, until both
    # queues look empty.
    while not (id_queue.empty() and item_queue.empty()):
        _store_fetched_items(conn)
        time.sleep(4)

    # Both queues being empty does not prove that the workers are idle: an id
    # may already have been taken off id_queue while its result has not been
    # queued yet. Wait until every id has been marked done, then store
    # whatever arrived after the last pass.
    id_queue.join()
    _store_fetched_items(conn)
finally:
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment