#!/bin/bash
# catch-up-fetch.sh — Fetch candidate stories from feeds, emit JSONL. Zero LLM.
# Usage: catch-up-fetch.sh SINCE_UNIX OUTPUT_FILE [--deep]
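#
# Example invocation (illustrative paths; GNU date shown, on macOS use "date -v-7d +%s"):
#   ./catch-up-fetch.sh "$(date -d '7 days ago' +%s)" /tmp/candidates.jsonl --deep
#
# Output is one JSON object per line. The RSS parser emits the first four
# fields below; the HN/Lobsters/Reddit parsers add score/comment metadata:
#   {"title":"...","url":"...","source":"HN","published_at":"2026-04-16",
#    "score":150,"num_comments":40,"comments_url":"https://news.ycombinator.com/item?id=..."}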

set -u

SINCE=${1:?"need SINCE_UNIX"}
OUT=${2:?"need OUTPUT_FILE"}
DEEP="${3:-}"

# BSD date takes "-r SECONDS"; GNU date wants "-d @SECONDS"; try both.
SINCE_DATE=$(date -r "$SINCE" +%Y-%m-%d 2>/dev/null || date -d "@$SINCE" +%Y-%m-%d)
UA="Mozilla/5.0 catch-up-fetch"
: > "$OUT"

# Private work dir. Plain "mktemp -d" covers GNU and recent BSD/macOS; the -t
# form is a fallback for older BSD mktemp (GNU rejects a template without X's).
# Not named TMPDIR, since mktemp itself consults that variable.
WORKDIR=$(mktemp -d 2>/dev/null || mktemp -d -t catchup)
trap 'rm -rf "$WORKDIR"' EXIT

HN_MIN_POINTS=150
LOBSTERS_MIN=25
REDDIT_MIN=250
REDDIT_LIMIT=10

if [ "$DEEP" = "--deep" ]; then
  HN_MIN_POINTS=75
  LOBSTERS_MIN=15
  REDDIT_MIN=150
  REDDIT_LIMIT=25
fi

# Write the feed/API parsers into the work dir
cat > "$WORKDIR/rss.py" <<'PY'
import sys, json
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from datetime import datetime

source = sys.argv[1]
since = sys.argv[2]
data = sys.stdin.buffer.read()
if not data:
    sys.exit(0)
try:
    root = ET.fromstring(data)
except ET.ParseError:
    sys.exit(0)
# Strip XML namespaces so item/entry lookups work on both RSS and Atom
for el in root.iter():
    if '}' in el.tag:
        el.tag = el.tag.split('}', 1)[1]
items = root.findall('.//item') or root.findall('.//entry')
for it in items:
    title = (it.findtext('title') or '').strip()
    link = (it.findtext('link') or '').strip()
    if not link:
        # Atom puts the link in an href attribute, not in element text
        le = it.find('link')
        if le is not None:
            link = le.get('href', '')
    pub = it.findtext('published') or it.findtext('updated') or it.findtext('pubDate') or ''
    date_str = ''
    if pub:
        # Try ISO 8601 first (2026-04-16T...), fall back to RFC 822 (Thu, 16 Apr 2026...)
        for parser in (
            lambda s: datetime.fromisoformat(s.replace('Z', '+00:00')),
            parsedate_to_datetime,
        ):
            try:
                date_str = parser(pub).strftime('%Y-%m-%d')
                break
            except Exception:
                continue
    # Require a parseable date within window — feeds without item dates flood otherwise
    if not date_str or date_str < since:
        continue
    if not (title and link):
        continue
    print(json.dumps({"title": title[:300], "url": link, "source": source, "published_at": date_str}, ensure_ascii=False))
PY
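
# Manual spot-check of the parser (example feed taken from FEEDS below):
#   curl -s https://jvns.ca/atom.xml | python3 "$WORKDIR/rss.py" "Julia Evans" 2026-01-01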

cat > "$WORKDIR/hn.py" <<'PY'
import json, sys
try:
    d = json.load(sys.stdin)
except Exception:  # empty or non-JSON body, e.g. a failed fetch
    sys.exit(0)
for h in d.get("hits", []):
    oid = h.get("objectID", "")
    comments = f"https://news.ycombinator.com/item?id={oid}"
    url = h.get("url") or comments  # Ask HN and other self posts have no external URL
    print(json.dumps({
        "title": (h.get("title") or "")[:300],
        "url": url,
        "source": "HN",
        "published_at": (h.get("created_at") or "")[:10],
        "score": h.get("points") or 0,
        "num_comments": h.get("num_comments") or 0,
        "comments_url": comments,
    }))
PY

cat > "$WORKDIR/lobsters.py" <<PY
import json, sys
since = "$SINCE_DATE"
minscore = $LOBSTERS_MIN
try:
    arr = json.load(sys.stdin)
except Exception:
    sys.exit(0)
for it in arr:
    score = it.get("score", 0)
    if score < minscore:
        continue
    d = (it.get("created_at") or "")[:10]
    if d and d < since:
        continue
    print(json.dumps({
        "title": (it.get("title") or "")[:300],
        "url": it.get("url") or it.get("short_id_url"),
        "source": "Lobsters",
        "published_at": d,
        "score": score,
        "comments_url": it.get("short_id_url"),
    }))
PY

cat > "$WORKDIR/reddit.py" <<PY
import json, sys
from datetime import datetime, timezone
since_unix = $SINCE
minscore = $REDDIT_MIN
sub = sys.argv[1]
try:
    d = json.load(sys.stdin)
except Exception:
    sys.exit(0)
for c in d.get("data", {}).get("children", []):
    p = c.get("data", {})
    if (p.get("score") or 0) < minscore:
        continue
    if (p.get("created_utc") or 0) < since_unix:
        continue
    date = datetime.fromtimestamp(p.get("created_utc", 0), tz=timezone.utc).strftime("%Y-%m-%d")
    url = p.get("url") or ("https://reddit.com" + p.get("permalink", ""))
    print(json.dumps({
        "title": (p.get("title") or "")[:300],
        "url": url,
        "source": "r/" + sub,
        "published_at": date,
        "score": p.get("score", 0),
        "num_comments": p.get("num_comments", 0),
        "comments_url": "https://reddit.com" + p.get("permalink", ""),
    }))
PY

fetch_rss() {
  curl -s -L -A "$UA" --max-time 12 "$2" | python3 "$WORKDIR/rss.py" "$1" "$SINCE_DATE" >> "$OUT"
}

fetch_hn() {
  curl -s -A "$UA" --max-time 15 \
    "https://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points%3E${HN_MIN_POINTS},created_at_i%3E${SINCE}&hitsPerPage=60" \
    | python3 "$WORKDIR/hn.py" >> "$OUT"
}
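
# numericFilters above URL-decodes to "points>HN_MIN_POINTS,created_at_i>SINCE",
# so the score floor and time window are applied by the Algolia API itself.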

fetch_lobsters() {
  curl -s -A "$UA" --max-time 12 "https://lobste.rs/hottest.json" \
    | python3 "$WORKDIR/lobsters.py" >> "$OUT"
}

fetch_reddit() {
  curl -s -A "$UA" --max-time 12 "https://old.reddit.com/r/$1/top.json?t=week&limit=${REDDIT_LIMIT}" \
    | python3 "$WORKDIR/reddit.py" "$1" >> "$OUT"
}

FEEDS=(
  "Simon Willison|https://simonwillison.net/atom/everything/"
  "Dan Luu|https://danluu.com/atom.xml"
  "Julia Evans|https://jvns.ca/atom.xml"
  "Brandur|https://brandur.org/articles.atom"
  "Drew DeVault|https://drewdevault.com/blog/index.xml"
  "Armin Ronacher|https://lucumr.pocoo.org/feed.atom"
  "Hillel Wayne|https://buttondown.com/hillelwayne/rss"
  "Marc Brooker|https://brooker.co.za/blog/rss.xml"
  "Murat Demirbas|https://muratbuffalo.blogspot.com/feeds/posts/default"
  "Aleksey Charapko|http://charap.co/feed/"
  "Martin Kleppmann|https://martin.kleppmann.com/feed.rss"
  "Geoffrey Huntley|https://ghuntley.com/rss/"
  "Ethan Mollick|https://www.oneusefulthing.org/feed"
  "Latent Space|https://www.latent.space/feed"
  "Pragmatic Engineer|https://newsletter.pragmaticengineer.com/feed"
  "Bytebytego|https://blog.bytebytego.com/feed"
  "Morning Paper|https://blog.acolyer.org/feed/"
  "Changelog|https://changelog.com/news/feed"
  "Last Week in AWS|https://www.lastweekinaws.com/feed/"
  "Netflix Tech|https://netflixtechblog.com/feed"
  "Stripe|https://stripe.com/blog/feed.rss"
  "Figma|https://www.figma.com/blog/feed/atom.xml"
  "Cloudflare|https://blog.cloudflare.com/rss/"
  "Discord|https://discord.com/blog/rss.xml"
  "Meta Engineering|https://engineering.fb.com/feed/"
  "DuckDB|https://duckdb.org/feed.xml"
  "Materialize|https://materialize.com/rss.xml"
  "TigerBeetle|https://tigerbeetle.com/blog/atom.xml"
  "PlanetScale|https://planetscale.com/blog/feed.atom"
)

REDDIT_SUBS=(programming ExperiencedDevs LocalLLaMA ClaudeAI databasedevelopment)

# Fetch serially so concurrent appends can't interleave lines in $OUT
# (each curl is network-bound, so the serial overhead is small).
for entry in "${FEEDS[@]}"; do
  src="${entry%%|*}"   # everything before the first "|"
  url="${entry#*|}"    # everything after the first "|"
  fetch_rss "$src" "$url"
done
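
# Parallel variant (sketch, not used): if fetch_rss wrote to stdout instead of
# appending to "$OUT", each job could fill its own part file, merged after wait:
#   i=0
#   for entry in "${FEEDS[@]}"; do
#     fetch_rss "${entry%%|*}" "${entry#*|}" > "$WORKDIR/$((i+=1)).part" &
#   done
#   wait
#   cat "$WORKDIR"/*.part >> "$OUT"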

fetch_hn
fetch_lobsters
for sub in "${REDDIT_SUBS[@]}"; do
  fetch_reddit "$sub"
done

total=$(wc -l < "$OUT" | tr -d ' ')
echo "fetched $total candidates into $OUT" >&2