thundergolfer · December 29, 2025 20:16
diff --git a/top_hn_blogs.py b/top_hn_blogs.py
 #!/usr/bin/env python3
 """
 Sum Hacker News post karma (points) by domain for all story posts in a given year.

 Data source: Hacker News Algolia Search API
 https://hn.algolia.com/api

 By default, common organizational domains (news sites, major platforms like GitHub,
 Twitter, YouTube, etc.) are excluded to focus on individual/personal blogs.

 The script automatically handles Algolia's 1000-result limit by querying day-by-day
 and splitting high-volume days into smaller chunks as needed.

 Example:
  python hn_top_domains_karma.py --year 2025 --top 10
  python hn_top_domains_karma.py --year 2025 --top 20 --csv out.csv
  python hn_top_domains_karma.py --year 2025 --top 10 --include-orgs
  python hn_top_domains_karma.py --year 2025 --top 10 --exclude-domains "example.com,test.com"
 """

 from __future__ import annotations

 import argparse
 import csv
 import sys
 import time
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Iterable
 from urllib.parse import urlparse

 import requests

 ALGOLIA_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"


 def utc_epoch_seconds(year: int, month: int, day: int) -> int:
    dt = datetime(year, month, day, 0, 0, 0, tzinfo=timezone.utc)
    return int(dt.timestamp())


 def normalize_domain(url: str) -> str:
    """
    Extract a normalized domain from a URL.
    - Lowercase
    - Strip leading 'www.'
    """
    try:
        parsed = urlparse(url)
        host = (parsed.netloc or "").lower()
        if host.startswith("www."):
            host = host[4:]
        return host or ""
    except Exception:
        return ""


 def query_time_range(
    start: int,
    end: int,
    session: requests.Session,
    polite_sleep_s: float,
    seen_ids: set,
 ) -> tuple[Iterable[dict], int]:
    """
    Query a specific time range and yield all unique hits.
    Returns tuple of (hits generator, total_hits_available).
    """
    page = 0
    nb_pages = None
    nb_hits = None

    def _generate():
        nonlocal page, nb_pages, nb_hits
        while True:
            params = {
                "tags": "story",
                "numericFilters": f"created_at_i>={start},created_at_i<={end}",
                "hitsPerPage": 1000,
                "page": page,
            }

            r = session.get(ALGOLIA_SEARCH_URL, params=params, timeout=30)
            r.raise_for_status()
            payload = r.json()

            hits = payload.get("hits", [])
            if nb_pages is None:
                nb_pages = int(payload.get("nbPages", 0))
                nb_hits = int(payload.get("nbHits", 0))

            for h in hits:
                obj_id = h.get("objectID")
                if obj_id and obj_id not in seen_ids:
                    seen_ids.add(obj_id)
                    yield h

            page += 1

            if page >= nb_pages:
                break

            if polite_sleep_s > 0:
                time.sleep(polite_sleep_s)

    gen = _generate()
    first_batch = []
    try:
        first_hit = next(gen)
        first_batch.append(first_hit)
    except StopIteration:
        pass

    def _chain():
        yield from first_batch
        yield from gen

    return _chain(), nb_hits or 0


 def iter_stories_by_year(
    year: int,
    session: requests.Session,
    polite_sleep_s: float = 0.25,
    verbose: bool = False,
 ) -> Iterable[dict]:
    """
    Yield all story hits for the given year.
    Uses numericFilters on created_at_i to bound to the year in UTC.

    The Algolia API limits results to 1000 hits per query. To work around this,
    we split the year into daily chunks and query each day separately.
    For high-volume days (>1000 posts), we automatically split into smaller chunks.
    """
    import calendar

    seen_ids = set()
    now = datetime.now(timezone.utc)
    end_month = 12
    end_day = 31

    if year == now.year:
        end_month = now.month
        end_day = now.day

    total_days = sum(
        min(calendar.monthrange(year, m)[1], end_day) if m == end_month else calendar.monthrange(year, m)[1]
        for m in range(1, end_month + 1)
    )
    day_count = 0

    for month in range(1, end_month + 1):
        _, days_in_month = calendar.monthrange(year, month)
        max_day = min(days_in_month, end_day) if month == end_month else days_in_month
        for day in range(1, max_day + 1):
            day_count += 1
            day_start = utc_epoch_seconds(year, month, day)
            day_end = (
                utc_epoch_seconds(year, month, day + 1)
                if day < days_in_month
                else (utc_epoch_seconds(year, month + 1, 1) if month < 12 else utc_epoch_seconds(year + 1, 1, 1))
            )
            day_end -= 1

            day_hits = 0

            hits_gen, total_available = query_time_range(day_start, day_end, session, polite_sleep_s, seen_ids)

            # TODO: If a 6-hour chunk has ≥1000 posts, we'll lose data. Not currently a problem
            # (no chunk has >300 posts), but may need recursive splitting in the future.
            if total_available >= 1000:
                if verbose:
                    print(
                        f"  Day {year}-{month:02d}-{day:02d} has {total_available} posts, "
                        f"splitting into smaller chunks...",
                        file=sys.stderr,
                    )

                day_duration = day_end - day_start + 1
                chunk_size = day_duration // 4

                for chunk_idx in range(4):
                    chunk_start = day_start + (chunk_idx * chunk_size)
                    chunk_end = day_start + ((chunk_idx + 1) * chunk_size) - 1
                    if chunk_idx == 3:
                        chunk_end = day_end

                    chunk_gen, _ = query_time_range(chunk_start, chunk_end, session, polite_sleep_s, seen_ids)
                    for hit in chunk_gen:
                        day_hits += 1
                        yield hit

                    if polite_sleep_s > 0 and chunk_idx < 3:
                        time.sleep(polite_sleep_s)
            else:
                for hit in hits_gen:
                    day_hits += 1
                    yield hit

            if verbose:
                progress_pct = (day_count / total_days) * 100
                truncated_msg = " (split into chunks)" if total_available >= 1000 else ""
                print(
                    f"  [{day_count}/{total_days} days, {progress_pct:.1f}%] "
                    f"{year}-{month:02d}-{day:02d}: {day_hits} posts{truncated_msg}",
                    file=sys.stderr,
                )

            if polite_sleep_s > 0:
                time.sleep(polite_sleep_s)


 @dataclass
 class DomainStats:
    points: int = 0
    posts: int = 0


 def main() -> int:
    ap = argparse.ArgumentParser()
    current_year = datetime.now(timezone.utc).year
    ap.add_argument("--year", type=int, default=current_year, help=f"Year to analyze (UTC). Default: {current_year}")
    ap.add_argument("--top", type=int, default=5, help="How many domains to show. Default: 5")
    ap.add_argument(
        "--include-self",
        action="store_true",
        help="Include self/Ask HN posts as a bucket (news.ycombinator.com (self)). Default: excluded.",
    )
    ap.add_argument(
        "--sleep",
        type=float,
        default=0.25,
        help="Polite delay between API page requests (seconds). Default: 0.25",
    )
    ap.add_argument("--csv", type=str, default="", help="Optional path to write full results CSV")
    ap.add_argument(
        "--verbose",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Show progress while fetching data (default: True)",
    )
    ap.add_argument(
        "--exclude-domains",
        type=str,
        default="",
        help="Comma-separated list of additional domains to exclude from results (e.g. 'example.com,test.com')",
    )
    ap.add_argument(
        "--include-orgs",
        action="store_true",
        help="Include organizational domains (by default, common news sites and major platforms are excluded)",
    )
    args = ap.parse_args()

    excluded_domains = set()
    if args.exclude_domains:
        excluded_domains.update(d.strip().lower() for d in args.exclude_domains.split(",") if d.strip())

    if not args.include_orgs:
        # These
        org_domains = {
            "404media.co",
            "9to5google.com",
            "anthropic.com",
            "apnews.com",
            "apple.com",
            "arstechnica.com",
            "arxiv.org",
            "axios.com",
            "bbc.co.uk",
            "bbc.com",
            "bleepingcomputer.com",
            "blog.cloudflare.com",
            "blog.google",
            "bloomberg.com",
            "bsky.app",
            "businessinsider.com",
            "cbc.ca",
            "cnbc.com",
            "cnn.com",
            "deepmind.google",
            "devblogs.microsoft.com",
            "developers.googleblog.com",
            "economist.com",
            "eff.org",
            "electrek.co",
            "en.wikipedia.org",
            "finance.yahoo.com",
            "fly.io",
            "forbes.com",
            "fortune.com",
            "ft.com",
            "gist.github.com",
            "github.com",
            "gizmodo.com",
            "heise.de",
            "huggingface.co",
            "krebsonsecurity.com",
            "latimes.com",
            "lwn.net",
            "macrumors.com",
            "medicalxpress.com",
            "medium.com",
            "mistral.ai",
            "nature.com",
            "nbcnews.com",
            "neowin.net",
            "newatlas.com",
            "newyorker.com",
            "npr.org",
            "nytimes.com",
            "old.reddit.com",
            "openai.com",
            "pcgamer.com",
            "phoronix.com",
            "phys.org",
            "politico.com",
            "propublica.org",
            "quantamagazine.org",
            "reddit.com",
            "reuters.com",
            "science.org",
            "scientificamerican.com",
            "smithsonianmag.com",
            "spectrum.ieee.org",
            "stackoverflow.com",
            "store.steampowered.com",
            "substack.com",
            "techcrunch.com",
            "techdirt.com",
            "theatlantic.com",
            "theconversation.com",
            "theguardian.com",
            "thenewstack.io",
            "theregister.com",
            "theverge.com",
            "tomshardware.com",
            "torrentfreak.com",
            "twitter.com",
            "washingtonpost.com",
            "wikipedia.org",
            "wired.com",
            "wsj.com",
            "x.com",
            "youtube.com",
            "zed.dev",
        }
        excluded_domains.update(org_domains)

    stats: dict[str, DomainStats] = defaultdict(DomainStats)
    total_posts = 0

    if args.verbose:
        print(f"Fetching all stories from {args.year}...", file=sys.stderr)
        print("This will query the Algolia API day-by-day to get complete results.\n", file=sys.stderr)

    with requests.Session() as session:
        for hit in iter_stories_by_year(args.year, session=session, polite_sleep_s=args.sleep, verbose=args.verbose):
            points = hit.get("points")
            if points is None:
                # Some items may not have points (rare) - treat as 0
                points = 0
            try:
                points = int(points)
            except Exception:
                points = 0

            url = hit.get("url") or ""
            domain = normalize_domain(url)

            if not domain:
                if not args.include_self:
                    continue
                domain = "news.ycombinator.com (self)"

            if domain in excluded_domains:
                continue

            stats[domain].points += points
            stats[domain].posts += 1
            total_posts += 1

    if args.verbose:
        print(f"\nTotal posts processed: {total_posts}", file=sys.stderr)
        print(f"Unique domains: {len(stats)}", file=sys.stderr)
        if excluded_domains:
            print(f"Excluded domains: {len(excluded_domains)}", file=sys.stderr)
        print(file=sys.stderr)

    ranked: list[tuple[str, DomainStats]] = sorted(stats.items(), key=lambda kv: kv[1].points, reverse=True)

    topn = ranked[: max(0, args.top)]
    title = f"Top {len(topn)} domains by total HN points in {args.year} (stories only)"
    if excluded_domains and not args.include_orgs:
        title += " - excluding organizational domains"
    elif excluded_domains:
        title += f" - excluding {len(excluded_domains)} domains"
    print(f"{title}:\n")
    for i, (domain, st) in enumerate(topn, start=1):
        avg = (st.points / st.posts) if st.posts else 0.0
        print(f"{i:>2}. {domain:<35}  points={st.points:<10} posts={st.posts:<8} avg={avg:.2f}")

    if args.csv:
        with open(args.csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["domain", "total_points", "post_count", "avg_points"])
            for domain, st in ranked:
                avg = (st.points / st.posts) if st.posts else 0.0
                w.writerow([domain, st.points, st.posts, f"{avg:.6f}"])
        print(f"\nWrote CSV: {args.csv}")

    return 0


 if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except requests.HTTPError as e:
        print(f"HTTP error: {e}", file=sys.stderr)
        raise
    except requests.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        raise
	#!/usr/bin/env python3
	"""
	Sum Hacker News post karma (points) by domain for all story posts in a given year.

	Data source: Hacker News Algolia Search API
	https://hn.algolia.com/api

	By default, common organizational domains (news sites, major platforms like GitHub,
	Twitter, YouTube, etc.) are excluded to focus on individual/personal blogs.

	The script automatically handles Algolia's 1000-result limit by querying day-by-day
	and splitting high-volume days into smaller chunks as needed.

	Example:
	python hn_top_domains_karma.py --year 2025 --top 10
	python hn_top_domains_karma.py --year 2025 --top 20 --csv out.csv
	python hn_top_domains_karma.py --year 2025 --top 10 --include-orgs
	python hn_top_domains_karma.py --year 2025 --top 10 --exclude-domains "example.com,test.com"
	"""

	from __future__ import annotations

	import argparse
	import csv
	import sys
	import time
	from collections import defaultdict
	from dataclasses import dataclass
	from datetime import datetime, timezone
	from typing import Iterable
	from urllib.parse import urlparse

	import requests

	ALGOLIA_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"


	def utc_epoch_seconds(year: int, month: int, day: int) -> int:
	dt = datetime(year, month, day, 0, 0, 0, tzinfo=timezone.utc)
	return int(dt.timestamp())


	def normalize_domain(url: str) -> str:
	"""
	Extract a normalized domain from a URL.
	- Lowercase
	- Strip leading 'www.'
	"""
	try:
	parsed = urlparse(url)
	host = (parsed.netloc or "").lower()
	if host.startswith("www."):
	host = host[4:]
	return host or ""
	except Exception:
	return ""


	def query_time_range(
	start: int,
	end: int,
	session: requests.Session,
	polite_sleep_s: float,
	seen_ids: set,
	) -> tuple[Iterable[dict], int]:
	"""
	Query a specific time range and yield all unique hits.
	Returns tuple of (hits generator, total_hits_available).
	"""
	page = 0
	nb_pages = None
	nb_hits = None

	def _generate():
	nonlocal page, nb_pages, nb_hits
	while True:
	params = {
	"tags": "story",
	"numericFilters": f"created_at_i>={start},created_at_i<={end}",
	"hitsPerPage": 1000,
	"page": page,
	}

	r = session.get(ALGOLIA_SEARCH_URL, params=params, timeout=30)
	r.raise_for_status()
	payload = r.json()

	hits = payload.get("hits", [])
	if nb_pages is None:
	nb_pages = int(payload.get("nbPages", 0))
	nb_hits = int(payload.get("nbHits", 0))

	for h in hits:
	obj_id = h.get("objectID")
	if obj_id and obj_id not in seen_ids:
	seen_ids.add(obj_id)
	yield h

	page += 1

	if page >= nb_pages:
	break

	if polite_sleep_s > 0:
	time.sleep(polite_sleep_s)

	gen = _generate()
	first_batch = []
	try:
	first_hit = next(gen)
	first_batch.append(first_hit)
	except StopIteration:
	pass

	def _chain():
	yield from first_batch
	yield from gen

	return _chain(), nb_hits or 0


	def iter_stories_by_year(
	year: int,
	session: requests.Session,
	polite_sleep_s: float = 0.25,
	verbose: bool = False,
	) -> Iterable[dict]:
	"""
	Yield all story hits for the given year.
	Uses numericFilters on created_at_i to bound to the year in UTC.

	The Algolia API limits results to 1000 hits per query. To work around this,
	we split the year into daily chunks and query each day separately.
	For high-volume days (>1000 posts), we automatically split into smaller chunks.
	"""
	import calendar

	seen_ids = set()
	now = datetime.now(timezone.utc)
	end_month = 12
	end_day = 31

	if year == now.year:
	end_month = now.month
	end_day = now.day

	total_days = sum(
	min(calendar.monthrange(year, m)[1], end_day) if m == end_month else calendar.monthrange(year, m)[1]
	for m in range(1, end_month + 1)
	)
	day_count = 0

	for month in range(1, end_month + 1):
	_, days_in_month = calendar.monthrange(year, month)
	max_day = min(days_in_month, end_day) if month == end_month else days_in_month
	for day in range(1, max_day + 1):
	day_count += 1
	day_start = utc_epoch_seconds(year, month, day)
	day_end = (
	utc_epoch_seconds(year, month, day + 1)
	if day < days_in_month
	else (utc_epoch_seconds(year, month + 1, 1) if month < 12 else utc_epoch_seconds(year + 1, 1, 1))
	)
	day_end -= 1

	day_hits = 0

	hits_gen, total_available = query_time_range(day_start, day_end, session, polite_sleep_s, seen_ids)

	# TODO: If a 6-hour chunk has ≥1000 posts, we'll lose data. Not currently a problem
	# (no chunk has >300 posts), but may need recursive splitting in the future.
	if total_available >= 1000:
	if verbose:
	print(
	f" Day {year}-{month:02d}-{day:02d} has {total_available} posts, "
	f"splitting into smaller chunks...",
	file=sys.stderr,
	)

	day_duration = day_end - day_start + 1
	chunk_size = day_duration // 4

	for chunk_idx in range(4):
	chunk_start = day_start + (chunk_idx * chunk_size)
	chunk_end = day_start + ((chunk_idx + 1) * chunk_size) - 1
	if chunk_idx == 3:
	chunk_end = day_end

	chunk_gen, _ = query_time_range(chunk_start, chunk_end, session, polite_sleep_s, seen_ids)
	for hit in chunk_gen:
	day_hits += 1
	yield hit

	if polite_sleep_s > 0 and chunk_idx < 3:
	time.sleep(polite_sleep_s)
	else:
	for hit in hits_gen:
	day_hits += 1
	yield hit

	if verbose:
	progress_pct = (day_count / total_days) * 100
	truncated_msg = " (split into chunks)" if total_available >= 1000 else ""
	print(
	f" [{day_count}/{total_days} days, {progress_pct:.1f}%] "
	f"{year}-{month:02d}-{day:02d}: {day_hits} posts{truncated_msg}",
	file=sys.stderr,
	)

	if polite_sleep_s > 0:
	time.sleep(polite_sleep_s)


	@dataclass
	class DomainStats:
	points: int = 0
	posts: int = 0


	def main() -> int:
	ap = argparse.ArgumentParser()
	current_year = datetime.now(timezone.utc).year
	ap.add_argument("--year", type=int, default=current_year, help=f"Year to analyze (UTC). Default: {current_year}")
	ap.add_argument("--top", type=int, default=5, help="How many domains to show. Default: 5")
	ap.add_argument(
	"--include-self",
	action="store_true",
	help="Include self/Ask HN posts as a bucket (news.ycombinator.com (self)). Default: excluded.",
	)
	ap.add_argument(
	"--sleep",
	type=float,
	default=0.25,
	help="Polite delay between API page requests (seconds). Default: 0.25",
	)
	ap.add_argument("--csv", type=str, default="", help="Optional path to write full results CSV")
	ap.add_argument(
	"--verbose",
	action=argparse.BooleanOptionalAction,
	default=True,
	help="Show progress while fetching data (default: True)",
	)
	ap.add_argument(
	"--exclude-domains",
	type=str,
	default="",
	help="Comma-separated list of additional domains to exclude from results (e.g. 'example.com,test.com')",
	)
	ap.add_argument(
	"--include-orgs",
	action="store_true",
	help="Include organizational domains (by default, common news sites and major platforms are excluded)",
	)
	args = ap.parse_args()

	excluded_domains = set()
	if args.exclude_domains:
	excluded_domains.update(d.strip().lower() for d in args.exclude_domains.split(",") if d.strip())

	if not args.include_orgs:
	# These
	org_domains = {
	"404media.co",
	"9to5google.com",
	"anthropic.com",
	"apnews.com",
	"apple.com",
	"arstechnica.com",
	"arxiv.org",
	"axios.com",
	"bbc.co.uk",
	"bbc.com",
	"bleepingcomputer.com",
	"blog.cloudflare.com",
	"blog.google",
	"bloomberg.com",
	"bsky.app",
	"businessinsider.com",
	"cbc.ca",
	"cnbc.com",
	"cnn.com",
	"deepmind.google",
	"devblogs.microsoft.com",
	"developers.googleblog.com",
	"economist.com",
	"eff.org",
	"electrek.co",
	"en.wikipedia.org",
	"finance.yahoo.com",
	"fly.io",
	"forbes.com",
	"fortune.com",
	"ft.com",
	"gist.github.com",
	"github.com",
	"gizmodo.com",
	"heise.de",
	"huggingface.co",
	"krebsonsecurity.com",
	"latimes.com",
	"lwn.net",
	"macrumors.com",
	"medicalxpress.com",
	"medium.com",
	"mistral.ai",
	"nature.com",
	"nbcnews.com",
	"neowin.net",
	"newatlas.com",
	"newyorker.com",
	"npr.org",
	"nytimes.com",
	"old.reddit.com",
	"openai.com",
	"pcgamer.com",
	"phoronix.com",
	"phys.org",
	"politico.com",
	"propublica.org",
	"quantamagazine.org",
	"reddit.com",
	"reuters.com",
	"science.org",
	"scientificamerican.com",
	"smithsonianmag.com",
	"spectrum.ieee.org",
	"stackoverflow.com",
	"store.steampowered.com",
	"substack.com",
	"techcrunch.com",
	"techdirt.com",
	"theatlantic.com",
	"theconversation.com",
	"theguardian.com",
	"thenewstack.io",
	"theregister.com",
	"theverge.com",
	"tomshardware.com",
	"torrentfreak.com",
	"twitter.com",
	"washingtonpost.com",
	"wikipedia.org",
	"wired.com",
	"wsj.com",
	"x.com",
	"youtube.com",
	"zed.dev",
	}
	excluded_domains.update(org_domains)

	stats: dict[str, DomainStats] = defaultdict(DomainStats)
	total_posts = 0

	if args.verbose:
	print(f"Fetching all stories from {args.year}...", file=sys.stderr)
	print("This will query the Algolia API day-by-day to get complete results.\n", file=sys.stderr)

	with requests.Session() as session:
	for hit in iter_stories_by_year(args.year, session=session, polite_sleep_s=args.sleep, verbose=args.verbose):
	points = hit.get("points")
	if points is None:
	# Some items may not have points (rare) - treat as 0
	points = 0
	try:
	points = int(points)
	except Exception:
	points = 0

	url = hit.get("url") or ""
	domain = normalize_domain(url)

	if not domain:
	if not args.include_self:
	continue
	domain = "news.ycombinator.com (self)"

	if domain in excluded_domains:
	continue

	stats[domain].points += points
	stats[domain].posts += 1
	total_posts += 1

	if args.verbose:
	print(f"\nTotal posts processed: {total_posts}", file=sys.stderr)
	print(f"Unique domains: {len(stats)}", file=sys.stderr)
	if excluded_domains:
	print(f"Excluded domains: {len(excluded_domains)}", file=sys.stderr)
	print(file=sys.stderr)

	ranked: list[tuple[str, DomainStats]] = sorted(stats.items(), key=lambda kv: kv[1].points, reverse=True)

	topn = ranked[: max(0, args.top)]
	title = f"Top {len(topn)} domains by total HN points in {args.year} (stories only)"
	if excluded_domains and not args.include_orgs:
	title += " - excluding organizational domains"
	elif excluded_domains:
	title += f" - excluding {len(excluded_domains)} domains"
	print(f"{title}:\n")
	for i, (domain, st) in enumerate(topn, start=1):
	avg = (st.points / st.posts) if st.posts else 0.0
	print(f"{i:>2}. {domain:<35} points={st.points:<10} posts={st.posts:<8} avg={avg:.2f}")

	if args.csv:
	with open(args.csv, "w", newline="", encoding="utf-8") as f:
	w = csv.writer(f)
	w.writerow(["domain", "total_points", "post_count", "avg_points"])
	for domain, st in ranked:
	avg = (st.points / st.posts) if st.posts else 0.0
	w.writerow([domain, st.points, st.posts, f"{avg:.6f}"])
	print(f"\nWrote CSV: {args.csv}")

	return 0


	if __name__ == "__main__":
	try:
	raise SystemExit(main())
	except requests.HTTPError as e:
	print(f"HTTP error: {e}", file=sys.stderr)
	raise
	except requests.RequestException as e:
	print(f"Request error: {e}", file=sys.stderr)
	raise
No results found