#!/usr/bin/env python3
"""
Sum Hacker News post karma (points) by domain for all story posts in a given year.

Data source: Hacker News Algolia Search API
https://hn.algolia.com/api

By default, common organizational domains (news sites, major platforms like GitHub,
Twitter, YouTube, etc.) are excluded to focus on individual/personal blogs.

The script automatically handles Algolia's 1000-result limit by querying day-by-day
and splitting high-volume days into smaller chunks as needed.

Example:
    python hn_top_domains_karma.py --year 2025 --top 10
    python hn_top_domains_karma.py --year 2025 --top 20 --csv out.csv
    python hn_top_domains_karma.py --year 2025 --top 10 --include-orgs
    python hn_top_domains_karma.py --year 2025 --top 10 --exclude-domains "example.com,test.com"
"""
from __future__ import annotations

import argparse
import csv
import sys
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable
from urllib.parse import urlparse

import requests

ALGOLIA_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"
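
# For reference, the script relies on the following fields of each Algolia
# response (illustrative, abridged shape; see https://hn.algolia.com/api for
# the authoritative schema):
#   {
#       "hits": [{"objectID": "...", "url": "...", "points": 123, "created_at_i": 1735689600}, ...],
#       "nbHits": 456,
#       "nbPages": 1,
#       "page": 0,
#   }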


def utc_epoch_seconds(year: int, month: int, day: int) -> int:
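    # e.g. (illustrative): utc_epoch_seconds(2025, 1, 1) == 1735689600 (2025-01-01T00:00:00Z)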
    dt = datetime(year, month, day, 0, 0, 0, tzinfo=timezone.utc)
    return int(dt.timestamp())


def normalize_domain(url: str) -> str:
    """
    Extract a normalized domain from a URL.
    - Lowercase
    - Strip leading 'www.'
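
    Example (illustrative): "https://WWW.Example.com/post?id=1" -> "example.com".
    A schemeless string such as "example.com/post" yields "" because urlparse
    places it in .path rather than .netloc.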
| """ | |
| try: | |
| parsed = urlparse(url) | |
| host = (parsed.netloc or "").lower() | |
| if host.startswith("www."): | |
| host = host[4:] | |
| return host or "" | |
| except Exception: | |
| return "" | |


def query_time_range(
    start: int,
    end: int,
    session: requests.Session,
    polite_sleep_s: float,
    seen_ids: set,
) -> tuple[Iterable[dict], int]:
    """
    Query a specific time range and yield all unique hits.
    Returns tuple of (hits generator, total_hits_available).
    """
    page = 0
    nb_pages = None
    nb_hits = None

    def _generate():
        nonlocal page, nb_pages, nb_hits
        while True:
            params = {
                "tags": "story",
                "numericFilters": f"created_at_i>={start},created_at_i<={end}",
                "hitsPerPage": 1000,
                "page": page,
            }
            r = session.get(ALGOLIA_SEARCH_URL, params=params, timeout=30)
            r.raise_for_status()
            payload = r.json()
            hits = payload.get("hits", [])
            if nb_pages is None:
                nb_pages = int(payload.get("nbPages", 0))
                nb_hits = int(payload.get("nbHits", 0))
            for h in hits:
                obj_id = h.get("objectID")
                if obj_id and obj_id not in seen_ids:
                    seen_ids.add(obj_id)
                    yield h
            page += 1
            if page >= nb_pages:
                break
            if polite_sleep_s > 0:
                time.sleep(polite_sleep_s)
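
    # Create the generator and prime it with a single next() call: that forces the
    # first request, which populates nb_hits so the caller can see the total number
    # of available hits before consuming the rest of the stream.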
    gen = _generate()
    first_batch = []
    try:
        first_hit = next(gen)
        first_batch.append(first_hit)
    except StopIteration:
        pass

    def _chain():
        yield from first_batch
        yield from gen

    return _chain(), nb_hits or 0


def iter_stories_by_year(
    year: int,
    session: requests.Session,
    polite_sleep_s: float = 0.25,
    verbose: bool = False,
) -> Iterable[dict]:
    """
    Yield all story hits for the given year.
    Uses numericFilters on created_at_i to bound to the year in UTC.

    The Algolia API limits results to 1000 hits per query. To work around this,
    we split the year into daily chunks and query each day separately.
    For high-volume days (>1000 posts), we automatically split into smaller chunks.
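
    Example (illustrative):
        with requests.Session() as s:
            for hit in iter_stories_by_year(2024, session=s, verbose=False):
                print(hit.get("url"))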
| """ | |
| import calendar | |
| seen_ids = set() | |
| now = datetime.now(timezone.utc) | |
| end_month = 12 | |
| end_day = 31 | |
| if year == now.year: | |
| end_month = now.month | |
| end_day = now.day | |
| total_days = sum( | |
| min(calendar.monthrange(year, m)[1], end_day) if m == end_month else calendar.monthrange(year, m)[1] | |
| for m in range(1, end_month + 1) | |
| ) | |
| day_count = 0 | |
| for month in range(1, end_month + 1): | |
| _, days_in_month = calendar.monthrange(year, month) | |
| max_day = min(days_in_month, end_day) if month == end_month else days_in_month | |
| for day in range(1, max_day + 1): | |
| day_count += 1 | |
| day_start = utc_epoch_seconds(year, month, day) | |
| day_end = ( | |
| utc_epoch_seconds(year, month, day + 1) | |
| if day < days_in_month | |
| else (utc_epoch_seconds(year, month + 1, 1) if month < 12 else utc_epoch_seconds(year + 1, 1, 1)) | |
| ) | |
| day_end -= 1 | |
| day_hits = 0 | |
| hits_gen, total_available = query_time_range(day_start, day_end, session, polite_sleep_s, seen_ids) | |
| # TODO: If a 6-hour chunk has ≥1000 posts, we'll lose data. Not currently a problem | |
| # (no chunk has >300 posts), but may need recursive splitting in the future. | |
| if total_available >= 1000: | |
| if verbose: | |
| print( | |
| f" Day {year}-{month:02d}-{day:02d} has {total_available} posts, " | |
| f"splitting into smaller chunks...", | |
| file=sys.stderr, | |
| ) | |
| day_duration = day_end - day_start + 1 | |
| chunk_size = day_duration // 4 | |
| for chunk_idx in range(4): | |
| chunk_start = day_start + (chunk_idx * chunk_size) | |
| chunk_end = day_start + ((chunk_idx + 1) * chunk_size) - 1 | |
| if chunk_idx == 3: | |
| chunk_end = day_end | |
| chunk_gen, _ = query_time_range(chunk_start, chunk_end, session, polite_sleep_s, seen_ids) | |
| for hit in chunk_gen: | |
| day_hits += 1 | |
| yield hit | |
| if polite_sleep_s > 0 and chunk_idx < 3: | |
| time.sleep(polite_sleep_s) | |
| else: | |
| for hit in hits_gen: | |
| day_hits += 1 | |
| yield hit | |
| if verbose: | |
| progress_pct = (day_count / total_days) * 100 | |
| truncated_msg = " (split into chunks)" if total_available >= 1000 else "" | |
| print( | |
| f" [{day_count}/{total_days} days, {progress_pct:.1f}%] " | |
| f"{year}-{month:02d}-{day:02d}: {day_hits} posts{truncated_msg}", | |
| file=sys.stderr, | |
| ) | |
| if polite_sleep_s > 0: | |
| time.sleep(polite_sleep_s) | |


@dataclass
class DomainStats:
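    """Aggregated totals for one domain: summed story points and number of posts."""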
    points: int = 0
    posts: int = 0


def main() -> int:
    ap = argparse.ArgumentParser()
    current_year = datetime.now(timezone.utc).year
    ap.add_argument("--year", type=int, default=current_year, help=f"Year to analyze (UTC). Default: {current_year}")
    ap.add_argument("--top", type=int, default=5, help="How many domains to show. Default: 5")
    ap.add_argument(
        "--include-self",
        action="store_true",
        help="Include self/Ask HN posts as a bucket (news.ycombinator.com (self)). Default: excluded.",
    )
    ap.add_argument(
        "--sleep",
        type=float,
        default=0.25,
        help="Polite delay between API page requests (seconds). Default: 0.25",
    )
    ap.add_argument("--csv", type=str, default="", help="Optional path to write full results CSV")
    ap.add_argument(
        "--verbose",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Show progress while fetching data (default: True)",
    )
    ap.add_argument(
        "--exclude-domains",
        type=str,
        default="",
        help="Comma-separated list of additional domains to exclude from results (e.g. 'example.com,test.com')",
    )
    ap.add_argument(
        "--include-orgs",
        action="store_true",
        help="Include organizational domains (by default, common news sites and major platforms are excluded)",
    )
    args = ap.parse_args()

    excluded_domains = set()
    if args.exclude_domains:
        excluded_domains.update(d.strip().lower() for d in args.exclude_domains.split(",") if d.strip())
    if not args.include_orgs:
        # These organizational domains (news outlets and major platforms) are
        # excluded by default so results focus on individual/personal blogs.
        org_domains = {
            "404media.co",
            "9to5google.com",
            "anthropic.com",
            "apnews.com",
            "apple.com",
            "arstechnica.com",
            "arxiv.org",
            "axios.com",
            "bbc.co.uk",
            "bbc.com",
            "bleepingcomputer.com",
            "blog.cloudflare.com",
            "blog.google",
            "bloomberg.com",
            "bsky.app",
            "businessinsider.com",
            "cbc.ca",
            "cnbc.com",
            "cnn.com",
            "deepmind.google",
            "devblogs.microsoft.com",
            "developers.googleblog.com",
            "economist.com",
            "eff.org",
            "electrek.co",
            "en.wikipedia.org",
            "finance.yahoo.com",
            "fly.io",
            "forbes.com",
            "fortune.com",
            "ft.com",
            "gist.github.com",
            "github.com",
            "gizmodo.com",
            "heise.de",
            "huggingface.co",
            "krebsonsecurity.com",
            "latimes.com",
            "lwn.net",
            "macrumors.com",
            "medicalxpress.com",
            "medium.com",
            "mistral.ai",
            "nature.com",
            "nbcnews.com",
            "neowin.net",
            "newatlas.com",
            "newyorker.com",
            "npr.org",
            "nytimes.com",
            "old.reddit.com",
            "openai.com",
            "pcgamer.com",
            "phoronix.com",
            "phys.org",
            "politico.com",
            "propublica.org",
            "quantamagazine.org",
            "reddit.com",
            "reuters.com",
            "science.org",
            "scientificamerican.com",
            "smithsonianmag.com",
            "spectrum.ieee.org",
            "stackoverflow.com",
            "store.steampowered.com",
            "substack.com",
            "techcrunch.com",
            "techdirt.com",
            "theatlantic.com",
            "theconversation.com",
            "theguardian.com",
            "thenewstack.io",
            "theregister.com",
            "theverge.com",
            "tomshardware.com",
            "torrentfreak.com",
            "twitter.com",
            "washingtonpost.com",
            "wikipedia.org",
            "wired.com",
            "wsj.com",
            "x.com",
            "youtube.com",
            "zed.dev",
        }
        excluded_domains.update(org_domains)

    stats: dict[str, DomainStats] = defaultdict(DomainStats)
    total_posts = 0

    if args.verbose:
        print(f"Fetching all stories from {args.year}...", file=sys.stderr)
        print("This will query the Algolia API day-by-day to get complete results.\n", file=sys.stderr)

    with requests.Session() as session:
        for hit in iter_stories_by_year(args.year, session=session, polite_sleep_s=args.sleep, verbose=args.verbose):
            points = hit.get("points")
            if points is None:
                # Some items may not have points (rare) - treat as 0
                points = 0
            try:
                points = int(points)
            except Exception:
                points = 0
            url = hit.get("url") or ""
            domain = normalize_domain(url)
            if not domain:
                if not args.include_self:
                    continue
                domain = "news.ycombinator.com (self)"
            if domain in excluded_domains:
                continue
            stats[domain].points += points
            stats[domain].posts += 1
            total_posts += 1

    if args.verbose:
        print(f"\nTotal posts processed: {total_posts}", file=sys.stderr)
        print(f"Unique domains: {len(stats)}", file=sys.stderr)
        if excluded_domains:
            print(f"Excluded domains: {len(excluded_domains)}", file=sys.stderr)
        print(file=sys.stderr)

    ranked: list[tuple[str, DomainStats]] = sorted(stats.items(), key=lambda kv: kv[1].points, reverse=True)
    topn = ranked[: max(0, args.top)]

    title = f"Top {len(topn)} domains by total HN points in {args.year} (stories only)"
    if excluded_domains and not args.include_orgs:
        title += " - excluding organizational domains"
    elif excluded_domains:
        title += f" - excluding {len(excluded_domains)} domains"
    print(f"{title}:\n")

    for i, (domain, st) in enumerate(topn, start=1):
        avg = (st.points / st.posts) if st.posts else 0.0
        print(f"{i:>2}. {domain:<35} points={st.points:<10} posts={st.posts:<8} avg={avg:.2f}")

    if args.csv:
        with open(args.csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["domain", "total_points", "post_count", "avg_points"])
            for domain, st in ranked:
                avg = (st.points / st.posts) if st.posts else 0.0
                w.writerow([domain, st.points, st.posts, f"{avg:.6f}"])
        print(f"\nWrote CSV: {args.csv}")

    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except requests.HTTPError as e:
        print(f"HTTP error: {e}", file=sys.stderr)
        raise
    except requests.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        raise