Skip to content

Instantly share code, notes, and snippets.

@plurch
Created May 14, 2026 16:44
Show Gist options
  • Select an option

  • Save plurch/8eed5e8e62f41414c4f15d67fb722eee to your computer and use it in GitHub Desktop.

Select an option

Save plurch/8eed5e8e62f41414c4f15d67fb722eee to your computer and use it in GitHub Desktop.
"""
Fetch all stargazers for a GitHub repo with starred_at timestamps and write to CSV.
Usage:
python fetch_stargazers.py <owner/repo> [output.csv]
Requires:
GITHUB_AUTH environment variable with a GitHub token.
Notes:
- Uses the `application/vnd.github.star+json` Accept header to include
`starred_at` timestamps (otherwise the API returns just the user object).
- Paginates 100 per page. Max 40,000 stargazers returnable (GitHub's
pagination limit of 400 pages * 100 per page). Repos with more than
40k stars will be truncated at that boundary.
- Honors X-RateLimit-Remaining / X-RateLimit-Reset and sleeps if needed.
- Retries on 5xx and 429 with exponential backoff.
"""
import csv
import os
import sys
import time
from typing import Iterator
import requests
# GitHub REST API configuration.
API_ROOT = "https://api.github.com"
# Stargazers returned per request (the API maximum for this endpoint).
PER_PAGE = 100
MAX_PAGES = 400  # GitHub hard limit: 40,000 stargazers via this endpoint
# Media type that makes the stargazers endpoint include `starred_at`.
ACCEPT_HEADER = "application/vnd.github.star+json"
# Attempts per request in get_with_retry before giving up.
MAX_RETRIES = 5
def make_session(token: str) -> requests.Session:
    """Build a requests Session preconfigured for the GitHub REST API.

    The session carries the star+json Accept header (so stargazer rows
    include `starred_at`), bearer-token auth, the pinned API version,
    and a stable User-Agent.
    """
    default_headers = {
        "Accept": ACCEPT_HEADER,
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "stargazer-fetcher",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session
def handle_rate_limit(resp: "requests.Response") -> None:
    """Sleep until the rate-limit window resets if the quota is exhausted.

    Reads the X-RateLimit-Remaining / X-RateLimit-Reset response headers.
    Only sleeps when zero requests remain AND a reset time is reported;
    otherwise this is a no-op. (The previous docstring claimed it also
    acted when "close to" the limit, which the code never did.)

    Args:
        resp: the HTTP response whose headers are inspected.
    """
    remaining = resp.headers.get("X-RateLimit-Remaining")
    reset = resp.headers.get("X-RateLimit-Reset")
    if remaining is None or reset is None:
        return
    if int(remaining) > 0:
        return
    # +1s of slack so we don't wake just before the window actually resets.
    sleep_for = max(int(reset) - int(time.time()), 0) + 1
    print(f" rate limit hit; sleeping {sleep_for}s until reset", file=sys.stderr)
    time.sleep(sleep_for)
def get_with_retry(
    session: "requests.Session",
    url: str,
    params: dict,
    max_retries: "int | None" = None,
) -> "requests.Response":
    """GET with retry on transient failures and rate-limit handling.

    Args:
        session: configured session (see make_session).
        url: absolute URL to fetch.
        params: query parameters for the request.
        max_retries: attempts before giving up; defaults to MAX_RETRIES.

    Returns:
        The successful (HTTP 200) response.

    Raises:
        requests.HTTPError: on a non-retryable status, or when retries
            are exhausted (via raise_for_status on the last response).
    """
    if max_retries is None:
        max_retries = MAX_RETRIES
    resp = None
    for attempt in range(max_retries):
        resp = session.get(url, params=params, timeout=30)
        if resp.status_code == 200:
            return resp
        if resp.status_code in (403, 429):
            # 403 from GitHub on this endpoint usually means rate limit
            # (secondary or primary).
            retry_after = resp.headers.get("Retry-After")
            if retry_after:
                sleep_for = int(retry_after) + 1
                print(f" got {resp.status_code}, Retry-After={retry_after}s", file=sys.stderr)
                time.sleep(sleep_for)
            else:
                handle_rate_limit(resp)
                # handle_rate_limit is a no-op unless the primary quota is
                # exhausted, so a secondary rate limit without Retry-After
                # previously retried in a tight loop; back off here too.
                time.sleep(2 ** attempt)
            continue
        if 500 <= resp.status_code < 600:
            backoff = 2 ** attempt
            print(f" got {resp.status_code}, backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            continue
        # Non-retryable error (e.g. 404): raise immediately.
        resp.raise_for_status()
    # Retries exhausted: the last response was non-200, so this raises.
    resp.raise_for_status()
    return resp  # unreachable; keeps the signature honest for type checkers
def iter_stargazers(session: requests.Session, repo: str) -> Iterator[dict]:
    """Yield {starred_at, user_login, user_id} for every stargazer of `repo`.

    Stops after MAX_PAGES pages (GitHub's pagination cap), on an empty
    page, or when a short page signals the final batch.
    """
    url = f"{API_ROOT}/repos/{repo}/stargazers"
    page = 1
    while page <= MAX_PAGES:
        resp = get_with_retry(session, url, {"per_page": PER_PAGE, "page": page})
        batch = resp.json()
        if not batch:
            return
        for entry in batch:
            # With the star+json Accept header, each entry is
            # {"starred_at": ..., "user": {...}}.
            account = entry.get("user") or {}
            yield {
                "starred_at": entry.get("starred_at", ""),
                "user_login": account.get("login", ""),
                "user_id": account.get("id", ""),
            }
        if len(batch) < PER_PAGE:
            return
        # Light pacing to be polite even when not rate-limited.
        time.sleep(0.05)
        page += 1
def main() -> int:
    """CLI entry point: parse args, fetch all stargazers, write the CSV.

    Returns 0 on success, 2 on usage/configuration errors.
    """
    args = sys.argv
    if len(args) < 2:
        print("Usage: python fetch_stargazers.py <owner/repo> [output.csv]", file=sys.stderr)
        return 2
    repo = args[1]
    if "/" not in repo:
        print(f"Repo must be in 'owner/repo' format, got: {repo}", file=sys.stderr)
        return 2
    default_name = f"{repo.replace('/', '_')}_stargazers.csv"
    out_path = args[2] if len(args) >= 3 else default_name
    token = os.environ.get("GITHUB_AUTH")
    if not token:
        print("GITHUB_AUTH environment variable is not set", file=sys.stderr)
        return 2
    session = make_session(token)
    print(f"Fetching stargazers for {repo} -> {out_path}", file=sys.stderr)
    total = 0
    with open(out_path, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=["starred_at", "user_login", "user_id"])
        writer.writeheader()
        for record in iter_stargazers(session, repo):
            writer.writerow(record)
            total += 1
            if total % 500 == 0:
                print(f" {total} stargazers...", file=sys.stderr)
    print(f"Done. Wrote {total} rows to {out_path}", file=sys.stderr)
    if total >= MAX_PAGES * PER_PAGE:
        print(
            f"WARNING: hit GitHub's pagination cap ({MAX_PAGES * PER_PAGE}). "
            "Results may be truncated.",
            file=sys.stderr,
        )
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so shell
    # scripts and CI can detect failures.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment