Hacker News Scraper
@SuperSonicHub1, created April 5, 2022 00:56

from datetime import datetime
import time
from html.parser import HTMLParser
from html.entities import name2codepoint
import logging
import sqlite3

import requests
# Settings
# See https://hn.algolia.com/api
DATABASE_FILE = "hackernews.db"
NUMERIC_FILTERS = "created_at_i<{}"
HITS_PER_PAGE = 1_000
TAGS = "story"
TIMESTAMP_FILE = "timestamp.txt"
NUMBER_PROCESSED_FILE = "number_processed.txt"
LOG_FILE = "hn-scrape.log"
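# Note: 1,000 appears to be Algolia's maximum hitsPerPage, and the
# created_at_i numeric filter is what lets the loop below page backwards
# through time instead of relying on deep result-set pagination.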
# Get number processed
# Allows us to see progress
number_processed = 0
try:
    with open(NUMBER_PROCESSED_FILE) as f:
        number_processed = int(f.read())
except (FileNotFoundError, ValueError):
    pass
# Get last-seen timestamp
# Allows us to keep going if the script fails
timestamp = int(time.time())
try:
    with open(TIMESTAMP_FILE) as f:
        timestamp = int(f.read())
except (FileNotFoundError, ValueError):
    pass
# Database setup
db = sqlite3.connect(DATABASE_FILE)
cursor = db.cursor()
cursor.execute('PRAGMA journal_mode=WAL;')
cursor.execute("""
CREATE TABLE IF NOT EXISTS hn_stories
(
    object_id INTEGER PRIMARY KEY,
    created_at TIMESTAMP,
    title TEXT,
    author TEXT,
    url TEXT,
    story_text TEXT,
    points INTEGER,
    num_comments INTEGER
);
""")
# Logging setup
logging.basicConfig(
    filename=LOG_FILE,
    format='%(asctime)s\t%(levelname)s\t%(message)s',
    level=logging.DEBUG,
)
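# Note: DEBUG on the root logger also captures urllib3's per-request
# debug messages, so the log file grows quickly; logging.INFO is a
# quieter alternative.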
# Utility function for logging exceptions
def log_exception(exception: Exception, insert_data: tuple):
    if insert_data:
        logging.exception(f"Insertion failed: {insert_data} | {vars(exception)}")
    else:
        logging.exception('')
# Utility parser and function for stripping HTML
class StripHTMLParser(HTMLParser):
    output: str = ""

    def handle_data(self, data):
        self.output += data

    # The two handlers below only run if convert_charrefs=False is passed
    # to HTMLParser; by default it resolves entity and character
    # references itself and routes them through handle_data.
    def handle_entityref(self, name):
        self.output += chr(name2codepoint[name])

    def handle_charref(self, name):
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        self.output += c

def strip_html(html: str) -> str:
    parser = StripHTMLParser()
    parser.feed(html)
    return parser.output
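# e.g. strip_html('<p>Hello &amp; <i>goodbye</i></p>') -> 'Hello & goodbye'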
# Create dataset
INSERT_SQL = """
INSERT INTO hn_stories
(
    object_id,
    created_at,
    title,
    author,
    url,
    story_text,
    points,
    num_comments
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
session = requests.Session()
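# Walk backwards through all of HN: each request asks for the
# HITS_PER_PAGE stories created strictly before `timestamp`, then moves
# `timestamp` back to the oldest story returned.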
while True:
    try:
        res = session.get(
            "https://hn.algolia.com/api/v1/search_by_date",
            params=dict(
                tags=TAGS,
                hitsPerPage=HITS_PER_PAGE,
                numericFilters=NUMERIC_FILTERS.format(timestamp),
            )
        )
        res.raise_for_status()
        data = res.json()
        stories = data["hits"]
        # Guard against an empty page; stories[-1] would raise IndexError
        # and leave the loop spinning on the same timestamp
        if not stories:
            break
        timestamp = stories[-1]['created_at_i']
        for story in stories:
            created_at = datetime.fromtimestamp(int(story['created_at_i']))
            story_text = story["story_text"]
            if story_text is not None:
                story_text = strip_html(story_text)
            insert_data = (
                int(story['objectID']),
                created_at,
                strip_html(story["title"]),
                strip_html(story['author']),
                story['url'],
                story_text,
                story['points'],
                story['num_comments'],
            )
            try:
                cursor.execute(INSERT_SQL, insert_data)
                logging.info(f"Story {repr(story['title'])} successfully added!")
            except Exception as e:
                log_exception(e, insert_data)
        db.commit()
        # If there are no more HN stories, we're done!
        if data["nbHits"] < HITS_PER_PAGE:
            break
        with open(TIMESTAMP_FILE, "w") as f:
            f.write(str(timestamp))
        number_processed += len(stories)
        with open(NUMBER_PROCESSED_FILE, "w") as f:
            f.write(str(number_processed))
        logging.info(f"{number_processed} stories committed!")
        logging.info(f"Current timestamp is {datetime.fromtimestamp(timestamp).isoformat()}")
        # Make sure we stay within API limits:
        # "We are limiting the number of API requests from a single IP to 10,000 per hour."
        time.sleep(3600 / 10_000)
    except Exception as e:
        log_exception(e, ())
# Index the finished dataset for faster lookups; IF NOT EXISTS keeps
# a re-run from crashing here
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS object_idx ON hn_stories (object_id);')
cursor.execute('CREATE INDEX IF NOT EXISTS created_atx ON hn_stories (created_at);')
db.commit()
logging.info("DONE!")