Created
April 5, 2022 00:56
-
-
Save SuperSonicHub1/61032952b20393e17237ab3dfa29e8cd to your computer and use it in GitHub Desktop.
Hacker News Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
import time | |
from html.parser import HTMLParser | |
from html.entities import name2codepoint | |
import logging | |
import sqlite3 | |
import traceback | |
import requests | |
# Settings
# See https://hn.algolia.com/api
DATABASE_FILE = "hackernews.db"          # SQLite file the scraped stories are written to
NUMERIC_FILTERS = "created_at_i<{}"      # Algolia filter template: only stories older than {timestamp}
HITS_PER_PAGE = 1_000                    # Max page size the Algolia HN API allows
TAGS = "story"                           # Restrict results to stories (no comments/polls)
TIMESTAMP_FILE = "timestamp.txt"         # Checkpoint: oldest timestamp fetched so far
NUMBER_PROCESSED_FILE = "number_processed.txt"  # Checkpoint: running count of stories fetched
LOG_FILE = "hn-scrape.log"               # Destination for the logging module's output
# Get number processed
# Allows us to see progress across restarts; a missing or unparseable
# checkpoint file simply means "start counting from zero".
number_processed = 0
try:
    with open(NUMBER_PROCESSED_FILE) as f:
        number_processed = int(f.read())
except (OSError, ValueError):
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit. OSError = file absent/unreadable,
    # ValueError = file contents are not an int.
    pass
# Get last-seen timestamp (comment previously said "number processed" — copy-paste slip)
# Allows us to keep going if the script fails: we resume paging from the
# oldest story timestamp we had reached. Defaults to "now" on a fresh run.
timestamp = int(time.time())
try:
    with open(TIMESTAMP_FILE) as f:
        timestamp = int(f.read())
except (OSError, ValueError):
    # Narrowed from a bare `except:`; only "checkpoint missing/corrupt"
    # should fall back to the current time.
    pass
# Database setup
db = sqlite3.connect(DATABASE_FILE)
cursor = db.cursor()
# WAL journal mode lets other processes read the database while this
# long-running scraper is writing to it.
cursor.execute('PRAGMA journal_mode=WAL;')
# One row per HN story; object_id is the HN/Algolia objectID, so it
# doubles as a natural primary key and de-duplicates re-fetched stories.
cursor.execute("""
CREATE TABLE IF NOT EXISTS hn_stories
(
    object_id INTEGER PRIMARY KEY,
    created_at TIMESTAMP,
    title TEXT,
    author TEXT,
    url TEXT,
    story_text TEXT,
    points INTEGER,
    num_comments INTEGER
);
""")
# Logging setup: tab-separated "timestamp LEVEL message" lines appended to
# LOG_FILE. DEBUG level so per-story successes and failures are all recorded.
logging.basicConfig(
    filename=LOG_FILE,
    format='%(asctime)s\t%(levelname)s\t%(message)s',
    level=logging.DEBUG,
)
# Utility function for logging exceptions | |
def log_exception(exception: Exception, insert_data: tuple) -> None:
    """Log the active exception with traceback.

    When *insert_data* (the row that failed to insert) is non-empty, the
    log line includes it plus the exception's attributes; otherwise just
    the traceback is recorded.
    """
    if not insert_data:
        logging.exception('')
        return
    logging.exception(f"Insertion failed: {insert_data} | {str(vars(exception))}")
# Utility function for stripping HTML | |
# Utility for stripping HTML
class StripHTMLParser(HTMLParser):
    """Accumulate only the text content of an HTML document in `output`,
    dropping all tags and decoding entity/character references."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance buffer (the original used a shared class attribute).
        self.output: str = ""

    def handle_data(self, data):
        self.output += data

    def handle_entityref(self, name):
        # BUG FIX: the original used `=` here, which discarded all text
        # collected so far whenever a named entity (e.g. &amp;) appeared.
        # Only reached with convert_charrefs=False.
        self.output += chr(name2codepoint[name])

    def handle_charref(self, name):
        # Numeric character reference: &#NNN; (decimal) or &#xHH; (hex).
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        self.output += c

def strip_html(html: str) -> str:
    """Return *html* with all markup removed and references decoded."""
    parser = StripHTMLParser()
    parser.feed(html)
    return parser.output
# Create dataset
# Parameterized insert — placeholders keep story text/titles from ever
# being interpolated into the SQL string.
INSERT_SQL = """
INSERT INTO hn_stories
(
    object_id,
    created_at,
    title,
    author,
    url,
    story_text,
    points,
    num_comments
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
# Shared session so repeated API requests can reuse one HTTP connection.
session = requests.Session()
# Main scrape loop: page backwards through HN stories by creation time,
# inserting each page into SQLite and checkpointing progress to disk so a
# crashed run can resume where it left off.
while True:
    try:
        # FIX: use the pooled `session` created above; the original called
        # `requests.get`, leaving the Session unused.
        res = session.get(
            "https://hn.algolia.com/api/v1/search_by_date",
            params=dict(
                tags=TAGS,
                hitsPerPage=HITS_PER_PAGE,
                numericFilters=NUMERIC_FILTERS.format(timestamp),
            )
        )
        res.raise_for_status()
        data = res.json()
        stories = data["hits"]
        # FIX: an empty page previously raised IndexError on stories[-1],
        # which the outer except swallowed — retrying the same timestamp
        # forever. No hits means we have reached the end.
        if not stories:
            break
        # Results are newest-first, so the last hit is the oldest; its
        # timestamp becomes the upper bound for the next page.
        timestamp = stories[-1]['created_at_i']
        for story in stories:
            created_at = datetime.fromtimestamp(int(story['created_at_i']))
            story_text = story["story_text"]
            if story_text is not None:
                story_text = strip_html(story_text)
            insert_data = (
                int(story['objectID']),
                created_at,
                strip_html(story["title"]),
                strip_html(story['author']),
                story['url'],
                story_text,
                story['points'],
                story['num_comments'],
            )
            try:
                cursor.execute(INSERT_SQL, insert_data)
                logging.info(f"Story {repr(story['title'])} successfully added!")
            except Exception as e:
                # Typically a duplicate object_id on resumed runs; log and
                # keep going rather than abort the whole page.
                log_exception(e, insert_data)
        db.commit()
        # If there are no more HN stories, we're done!
        if data["nbHits"] < HITS_PER_PAGE:
            break
        # Checkpoint only after a successful commit.
        with open(TIMESTAMP_FILE, "w") as f:
            f.write(str(timestamp))
        number_processed += HITS_PER_PAGE
        with open(NUMBER_PROCESSED_FILE, "w") as f:
            f.write(str(number_processed))
        logging.info(f"{number_processed} Stories committed!")
        logging.info(f"Current timestamp is {datetime.fromtimestamp(timestamp).isoformat()}")
        # Make sure we stay within API limits
        # We are limiting the number of API requests from a single IP to 10,000 per hour.
        time.sleep(3600/10000)
    except Exception as e:
        # Top-level boundary: log with traceback and retry the page.
        log_exception(e, ())
# Build indexes after the bulk load (faster than maintaining them per-insert).
# FIX: IF NOT EXISTS so a re-run after completion doesn't crash here.
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS object_idx ON hn_stories (object_id);')
cursor.execute('CREATE INDEX IF NOT EXISTS created_atx ON hn_stories (created_at);')
db.commit()
logging.info("DONE!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://github.com/minimaxir/get-all-hacker-news-submissions-comments