@thundergolfer
Created December 29, 2025 20:16
#!/usr/bin/env python3
"""
Sum Hacker News post karma (points) by domain for all story posts in a given year.
Data source: Hacker News Algolia Search API
https://hn.algolia.com/api
By default, common organizational domains (news sites, major platforms like GitHub,
Twitter, YouTube, etc.) are excluded to focus on individual/personal blogs.
The script automatically handles Algolia's 1000-result limit by querying day-by-day
and splitting high-volume days into smaller chunks as needed.
Example:
python hn_top_domains_karma.py --year 2025 --top 10
python hn_top_domains_karma.py --year 2025 --top 20 --csv out.csv
python hn_top_domains_karma.py --year 2025 --top 10 --include-orgs
python hn_top_domains_karma.py --year 2025 --top 10 --exclude-domains "example.com,test.com"
"""
from __future__ import annotations
import argparse
import csv
import sys
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable
from urllib.parse import urlparse
import requests
ALGOLIA_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"
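# For reference, an abridged sketch of the Algolia response fields this script reads.
# Field names match the code below; the values shown here are made up for illustration:
#   {
#     "hits": [{"objectID": "42000000", "points": 123, "url": "https://example.com/post",
#               "created_at_i": 1735689600}],
#     "nbHits": 842,
#     "nbPages": 1,
#   }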
def utc_epoch_seconds(year: int, month: int, day: int) -> int:
    dt = datetime(year, month, day, 0, 0, 0, tzinfo=timezone.utc)
    return int(dt.timestamp())
def normalize_domain(url: str) -> str:
    """
    Extract a normalized domain from a URL.
    - Lowercase
    - Strip leading 'www.'
    """
    try:
        parsed = urlparse(url)
        host = (parsed.netloc or "").lower()
        if host.startswith("www."):
            host = host[4:]
        return host or ""
    except Exception:
        return ""
def query_time_range(
    start: int,
    end: int,
    session: requests.Session,
    polite_sleep_s: float,
    seen_ids: set,
) -> tuple[Iterable[dict], int]:
    """
    Query a specific time range and yield all unique hits.
    Returns a tuple of (hits generator, total_hits_available).
    """
    page = 0
    nb_pages = None
    nb_hits = None

    def _generate():
        nonlocal page, nb_pages, nb_hits
        while True:
            params = {
                "tags": "story",
                "numericFilters": f"created_at_i>={start},created_at_i<={end}",
                "hitsPerPage": 1000,
                "page": page,
            }
            r = session.get(ALGOLIA_SEARCH_URL, params=params, timeout=30)
            r.raise_for_status()
            payload = r.json()
            hits = payload.get("hits", [])
            if nb_pages is None:
                nb_pages = int(payload.get("nbPages", 0))
                nb_hits = int(payload.get("nbHits", 0))
            for h in hits:
                obj_id = h.get("objectID")
                if obj_id and obj_id not in seen_ids:
                    seen_ids.add(obj_id)
                    yield h
            page += 1
            if page >= nb_pages:
                break
            if polite_sleep_s > 0:
                time.sleep(polite_sleep_s)

    # Prime the generator with one hit so the first request runs and nb_hits is
    # populated before we return the total to the caller.
    gen = _generate()
    first_batch = []
    try:
        first_hit = next(gen)
        first_batch.append(first_hit)
    except StopIteration:
        pass

    def _chain():
        yield from first_batch
        yield from gen

    return _chain(), nb_hits or 0
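# Rough usage sketch for query_time_range (assumes network access; start/end are
# hypothetical Unix timestamps bounding a UTC window):
#   with requests.Session() as s:
#       hits, total = query_time_range(start, end, s, polite_sleep_s=0.25, seen_ids=set())
#       for h in hits:
#           ...  # each h is a unique Algolia "hit" dict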
def iter_stories_by_year(
    year: int,
    session: requests.Session,
    polite_sleep_s: float = 0.25,
    verbose: bool = False,
) -> Iterable[dict]:
    """
    Yield all story hits for the given year.
    Uses numericFilters on created_at_i to bound to the year in UTC.

    The Algolia API limits results to 1000 hits per query. To work around this,
    we split the year into daily chunks and query each day separately.
    For high-volume days (>1000 posts), we automatically split into smaller chunks.
    """
    import calendar

    seen_ids = set()
    now = datetime.now(timezone.utc)
    end_month = 12
    end_day = 31
    if year == now.year:
        end_month = now.month
        end_day = now.day
    total_days = sum(
        min(calendar.monthrange(year, m)[1], end_day) if m == end_month else calendar.monthrange(year, m)[1]
        for m in range(1, end_month + 1)
    )
    day_count = 0
    for month in range(1, end_month + 1):
        _, days_in_month = calendar.monthrange(year, month)
        max_day = min(days_in_month, end_day) if month == end_month else days_in_month
        for day in range(1, max_day + 1):
            day_count += 1
            day_start = utc_epoch_seconds(year, month, day)
            day_end = (
                utc_epoch_seconds(year, month, day + 1)
                if day < days_in_month
                else (utc_epoch_seconds(year, month + 1, 1) if month < 12 else utc_epoch_seconds(year + 1, 1, 1))
            )
            day_end -= 1
            day_hits = 0
            hits_gen, total_available = query_time_range(day_start, day_end, session, polite_sleep_s, seen_ids)
            # TODO: If a 6-hour chunk has ≥1000 posts, we'll lose data. Not currently a problem
            # (no chunk has >300 posts), but may need recursive splitting in the future.
            if total_available >= 1000:
                if verbose:
                    print(
                        f" Day {year}-{month:02d}-{day:02d} has {total_available} posts, "
                        f"splitting into smaller chunks...",
                        file=sys.stderr,
                    )
                day_duration = day_end - day_start + 1
                chunk_size = day_duration // 4
                for chunk_idx in range(4):
                    chunk_start = day_start + (chunk_idx * chunk_size)
                    chunk_end = day_start + ((chunk_idx + 1) * chunk_size) - 1
                    if chunk_idx == 3:
                        chunk_end = day_end
                    chunk_gen, _ = query_time_range(chunk_start, chunk_end, session, polite_sleep_s, seen_ids)
                    for hit in chunk_gen:
                        day_hits += 1
                        yield hit
                    if polite_sleep_s > 0 and chunk_idx < 3:
                        time.sleep(polite_sleep_s)
            else:
                for hit in hits_gen:
                    day_hits += 1
                    yield hit
            if verbose:
                progress_pct = (day_count / total_days) * 100
                truncated_msg = " (split into chunks)" if total_available >= 1000 else ""
                print(
                    f" [{day_count}/{total_days} days, {progress_pct:.1f}%] "
                    f"{year}-{month:02d}-{day:02d}: {day_hits} posts{truncated_msg}",
                    file=sys.stderr,
                )
            if polite_sleep_s > 0:
                time.sleep(polite_sleep_s)
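# Rough usage sketch for iter_stories_by_year, mirroring how main() calls it
# (the year value is hypothetical):
#   with requests.Session() as s:
#       for hit in iter_stories_by_year(2025, session=s, verbose=False):
#           print(hit.get("url"), hit.get("points"))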
@dataclass
class DomainStats:
    points: int = 0
    posts: int = 0
def main() -> int:
    ap = argparse.ArgumentParser()
    current_year = datetime.now(timezone.utc).year
    ap.add_argument("--year", type=int, default=current_year, help=f"Year to analyze (UTC). Default: {current_year}")
    ap.add_argument("--top", type=int, default=5, help="How many domains to show. Default: 5")
    ap.add_argument(
        "--include-self",
        action="store_true",
        help="Include self/Ask HN posts as a bucket (news.ycombinator.com (self)). Default: excluded.",
    )
    ap.add_argument(
        "--sleep",
        type=float,
        default=0.25,
        help="Polite delay between API page requests (seconds). Default: 0.25",
    )
    ap.add_argument("--csv", type=str, default="", help="Optional path to write full results CSV")
    ap.add_argument(
        "--verbose",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Show progress while fetching data (default: True)",
    )
    ap.add_argument(
        "--exclude-domains",
        type=str,
        default="",
        help="Comma-separated list of additional domains to exclude from results (e.g. 'example.com,test.com')",
    )
    ap.add_argument(
        "--include-orgs",
        action="store_true",
        help="Include organizational domains (by default, common news sites and major platforms are excluded)",
    )
    args = ap.parse_args()

    excluded_domains = set()
    if args.exclude_domains:
        excluded_domains.update(d.strip().lower() for d in args.exclude_domains.split(",") if d.strip())
    if not args.include_orgs:
        # These organizational domains (news sites, major platforms, corporate blogs)
        # are excluded by default so the ranking highlights individual/personal blogs.
        org_domains = {
            "404media.co",
            "9to5google.com",
            "anthropic.com",
            "apnews.com",
            "apple.com",
            "arstechnica.com",
            "arxiv.org",
            "axios.com",
            "bbc.co.uk",
            "bbc.com",
            "bleepingcomputer.com",
            "blog.cloudflare.com",
            "blog.google",
            "bloomberg.com",
            "bsky.app",
            "businessinsider.com",
            "cbc.ca",
            "cnbc.com",
            "cnn.com",
            "deepmind.google",
            "devblogs.microsoft.com",
            "developers.googleblog.com",
            "economist.com",
            "eff.org",
            "electrek.co",
            "en.wikipedia.org",
            "finance.yahoo.com",
            "fly.io",
            "forbes.com",
            "fortune.com",
            "ft.com",
            "gist.github.com",
            "github.com",
            "gizmodo.com",
            "heise.de",
            "huggingface.co",
            "krebsonsecurity.com",
            "latimes.com",
            "lwn.net",
            "macrumors.com",
            "medicalxpress.com",
            "medium.com",
            "mistral.ai",
            "nature.com",
            "nbcnews.com",
            "neowin.net",
            "newatlas.com",
            "newyorker.com",
            "npr.org",
            "nytimes.com",
            "old.reddit.com",
            "openai.com",
            "pcgamer.com",
            "phoronix.com",
            "phys.org",
            "politico.com",
            "propublica.org",
            "quantamagazine.org",
            "reddit.com",
            "reuters.com",
            "science.org",
            "scientificamerican.com",
            "smithsonianmag.com",
            "spectrum.ieee.org",
            "stackoverflow.com",
            "store.steampowered.com",
            "substack.com",
            "techcrunch.com",
            "techdirt.com",
            "theatlantic.com",
            "theconversation.com",
            "theguardian.com",
            "thenewstack.io",
            "theregister.com",
            "theverge.com",
            "tomshardware.com",
            "torrentfreak.com",
            "twitter.com",
            "washingtonpost.com",
            "wikipedia.org",
            "wired.com",
            "wsj.com",
            "x.com",
            "youtube.com",
            "zed.dev",
        }
        excluded_domains.update(org_domains)
    stats: dict[str, DomainStats] = defaultdict(DomainStats)
    total_posts = 0

    if args.verbose:
        print(f"Fetching all stories from {args.year}...", file=sys.stderr)
        print("This will query the Algolia API day-by-day to get complete results.\n", file=sys.stderr)

    with requests.Session() as session:
        for hit in iter_stories_by_year(args.year, session=session, polite_sleep_s=args.sleep, verbose=args.verbose):
            points = hit.get("points")
            if points is None:
                # Some items may not have points (rare) - treat as 0
                points = 0
            try:
                points = int(points)
            except Exception:
                points = 0
            url = hit.get("url") or ""
            domain = normalize_domain(url)
            if not domain:
                if not args.include_self:
                    continue
                domain = "news.ycombinator.com (self)"
            if domain in excluded_domains:
                continue
            stats[domain].points += points
            stats[domain].posts += 1
            total_posts += 1

    if args.verbose:
        print(f"\nTotal posts processed: {total_posts}", file=sys.stderr)
        print(f"Unique domains: {len(stats)}", file=sys.stderr)
        if excluded_domains:
            print(f"Excluded domains: {len(excluded_domains)}", file=sys.stderr)
        print(file=sys.stderr)

    ranked: list[tuple[str, DomainStats]] = sorted(stats.items(), key=lambda kv: kv[1].points, reverse=True)
    topn = ranked[: max(0, args.top)]

    title = f"Top {len(topn)} domains by total HN points in {args.year} (stories only)"
    if excluded_domains and not args.include_orgs:
        title += " - excluding organizational domains"
    elif excluded_domains:
        title += f" - excluding {len(excluded_domains)} domains"
    print(f"{title}:\n")

    for i, (domain, st) in enumerate(topn, start=1):
        avg = (st.points / st.posts) if st.posts else 0.0
        print(f"{i:>2}. {domain:<35} points={st.points:<10} posts={st.posts:<8} avg={avg:.2f}")
    if args.csv:
        with open(args.csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["domain", "total_points", "post_count", "avg_points"])
            for domain, st in ranked:
                avg = (st.points / st.posts) if st.posts else 0.0
                w.writerow([domain, st.points, st.posts, f"{avg:.6f}"])
        print(f"\nWrote CSV: {args.csv}")

    return 0
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except requests.HTTPError as e:
        print(f"HTTP error: {e}", file=sys.stderr)
        raise
    except requests.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        raise