Created
May 14, 2026 16:44
-
-
Save plurch/8eed5e8e62f41414c4f15d67fb722eee to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Fetch all stargazers for a GitHub repo with starred_at timestamps and write to CSV.

Usage:
    python fetch_stargazers.py <owner/repo> [output.csv]

Requires:
    GITHUB_AUTH environment variable with a GitHub token.

Notes:
    - Uses the `application/vnd.github.star+json` Accept header to include
      `starred_at` timestamps (otherwise the API returns just the user object).
    - Paginates 100 per page. At most 40,000 stargazers can be returned
      (GitHub's pagination limit of 400 pages * 100 per page). Repos with more
      than 40k stars will be truncated at that boundary.
    - On a 403 or 429, honors Retry-After, or sleeps until X-RateLimit-Reset
      when X-RateLimit-Remaining is 0, before retrying.
    - Retries 5xx responses with exponential backoff.
"""
import csv
import os
import sys
import time
from typing import Iterator

import requests
# Base URL for every GitHub REST API request made by this script.
API_ROOT: str = "https://api.github.com"

# Media type that makes the stargazers endpoint include `starred_at`.
ACCEPT_HEADER: str = "application/vnd.github.star+json"

# Page size requested from the API and the highest page number fetched.
# GitHub hard limit: 40,000 stargazers via this endpoint (400 pages * 100).
PER_PAGE: int = 100
MAX_PAGES: int = 400

# Number of attempts made for a single request before giving up.
MAX_RETRIES: int = 5
def make_session(token: str) -> requests.Session:
    """Build a requests session preconfigured for the GitHub REST API.

    Every request sent through the returned session carries the stargazer
    media type from ``ACCEPT_HEADER``, a bearer ``Authorization`` header built
    from *token*, a pinned ``X-GitHub-Api-Version``, and a ``User-Agent``.
    """
    default_headers = {
        "Accept": ACCEPT_HEADER,
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "stargazer-fetcher",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session
def handle_rate_limit(resp: requests.Response) -> None:
    """Sleep until the rate-limit window resets if the quota is exhausted.

    Reads ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` from
    ``resp.headers``. The reset value is compared against ``time.time()``,
    i.e. it is treated as epoch seconds. When the remaining count is 0 the
    call blocks until one second past the reset time; otherwise it returns
    immediately.

    Missing, empty, or non-integer header values are treated as "no
    rate-limit information" and never raise, so a malformed header cannot
    abort the caller's retry loop.
    """
    headers = resp.headers
    try:
        remaining = int(headers.get("X-RateLimit-Remaining"))
        reset_at = int(headers.get("X-RateLimit-Reset"))
    except (TypeError, ValueError):
        # TypeError: header absent (None); ValueError: not an integer.
        return

    if remaining != 0:
        return

    # Clamp at 0 in case the reset time has already passed, then add a
    # one-second margin so we wake up after the window has rolled over.
    sleep_for = max(reset_at - int(time.time()), 0) + 1
    print(f" rate limit hit; sleeping {sleep_for}s until reset", file=sys.stderr)
    time.sleep(sleep_for)
def get_with_retry(session: requests.Session, url: str, params: dict) -> requests.Response:
    """GET ``url`` with retries on transient failures and rate-limit handling.

    Makes up to ``MAX_RETRIES`` attempts. Between attempts it waits:

    - 403/429: ``Retry-After`` seconds (plus one) when that header is an
      integer; otherwise until the rate-limit reset when
      ``X-RateLimit-Remaining`` is 0; otherwise an exponential backoff, so a
      throttled request is never retried immediately.
    - 5xx, connection errors and timeouts: exponential backoff (1s, 2s, 4s, ...).

    No wait is performed after the final attempt.

    Args:
        session: Session used to issue the request.
        url: Absolute URL to fetch.
        params: Query parameters passed through to ``session.get``.

    Returns:
        The first response with status 200, or a response whose status is
        neither 200 nor an HTTP error (returned as-is, without retrying).

    Raises:
        requests.HTTPError: On a non-retryable 4xx, or when the last attempt
            still returned 403/429/5xx.
        requests.ConnectionError, requests.Timeout: When the last attempt
            failed at the network level.
    """
    for attempt in range(MAX_RETRIES):
        final_attempt = attempt == MAX_RETRIES - 1
        backoff = 2 ** attempt

        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout) as exc:
            if final_attempt:
                raise
            print(f" request failed ({exc}), backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            continue

        status = resp.status_code
        if status == 200:
            return resp

        throttled = status in (403, 429)
        if not throttled and not 500 <= status < 600:
            # Non-retryable: raise on 4xx, hand back anything that is not an
            # error status instead of re-requesting it.
            resp.raise_for_status()
            return resp

        if final_attempt:
            # Sleeping now would only delay the error raised below.
            break

        if throttled:
            # 403 from GitHub on this endpoint usually means rate limit
            # (secondary or primary).
            headers = resp.headers
            retry_after = headers.get("Retry-After")
            wait = None
            if retry_after:
                try:
                    wait = max(int(retry_after), 0) + 1
                except ValueError:
                    wait = None  # not a plain number of seconds
            if wait is not None:
                print(f" got {status}, Retry-After={retry_after}s", file=sys.stderr)
                time.sleep(wait)
                continue
            if (
                headers.get("X-RateLimit-Remaining") == "0"
                and headers.get("X-RateLimit-Reset") is not None
            ):
                handle_rate_limit(resp)
                continue
            # No header tells us how long to wait: fall through to backoff.

        print(f" got {status}, backing off {backoff}s", file=sys.stderr)
        time.sleep(backoff)

    # Retries exhausted on a retryable status: surface it as an HTTPError.
    resp.raise_for_status()
    return resp
def iter_stargazers(session: requests.Session, repo: str) -> Iterator[dict]:
    """Yield one ``{starred_at, user_login, user_id}`` dict per stargazer.

    Walks the repo's stargazers endpoint page by page, starting at page 1,
    and stops at the first empty page, the first page shorter than
    ``PER_PAGE``, or after ``MAX_PAGES`` pages, whichever comes first.
    Fields missing from a response entry are yielded as empty strings.
    """
    endpoint = f"{API_ROOT}/repos/{repo}/stargazers"
    page_no = 1
    while page_no <= MAX_PAGES:
        query = {"per_page": PER_PAGE, "page": page_no}
        entries = get_with_retry(session, endpoint, query).json()
        if not entries:
            break

        for entry in entries:
            # With the star+json Accept header, each entry has the shape
            # {starred_at, user: {...}}; tolerate a missing or null user.
            account = entry.get("user") or {}
            yield {
                "starred_at": entry.get("starred_at", ""),
                "user_login": account.get("login", ""),
                "user_id": account.get("id", ""),
            }

        # A short page means this was the last one.
        if len(entries) < PER_PAGE:
            break

        # Light pacing to be polite even when not rate-limited.
        time.sleep(0.05)
        page_no += 1
def main() -> int:
    """Command-line entry point: write a repo's stargazers to a CSV file.

    Reads the repository (``owner/repo``) and an optional output path from
    ``sys.argv``, and the token from the ``GITHUB_AUTH`` environment
    variable. Progress and errors are printed to stderr.

    Returns:
        0 on success, 1 if a request to GitHub failed (rows fetched before
        the failure remain in the output file), 2 on a usage or
        configuration error.
    """
    if len(sys.argv) < 2:
        print("Usage: python fetch_stargazers.py <owner/repo> [output.csv]", file=sys.stderr)
        return 2

    repo = sys.argv[1]
    # Require exactly two non-empty segments: rejects "owner/", "/repo"
    # and "a/b/c", which a bare '"/" in repo' check would let through.
    owner, _, name = repo.partition("/")
    if not owner or not name or "/" in name:
        print(f"Repo must be in 'owner/repo' format, got: {repo}", file=sys.stderr)
        return 2

    out_path = sys.argv[2] if len(sys.argv) >= 3 else f"{repo.replace('/', '_')}_stargazers.csv"

    token = os.environ.get("GITHUB_AUTH")
    if not token:
        print("GITHUB_AUTH environment variable is not set", file=sys.stderr)
        return 2

    print(f"Fetching stargazers for {repo} -> {out_path}", file=sys.stderr)
    count = 0
    try:
        # The session is a context manager; closing it releases its
        # pooled connections once the fetch is finished or has failed.
        with make_session(token) as session:
            with open(out_path, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=["starred_at", "user_login", "user_id"])
                writer.writeheader()
                for row in iter_stargazers(session, repo):
                    writer.writerow(row)
                    count += 1
                    if count % 500 == 0:
                        print(f" {count} stargazers...", file=sys.stderr)
    except requests.RequestException as exc:
        # Report HTTP/network failures as a message and exit code rather
        # than an unhandled traceback.
        print(f"Error: request to GitHub failed after {count} rows: {exc}", file=sys.stderr)
        return 1

    print(f"Done. Wrote {count} rows to {out_path}", file=sys.stderr)
    if count >= MAX_PAGES * PER_PAGE:
        print(
            f"WARNING: hit GitHub's pagination cap ({MAX_PAGES * PER_PAGE}). "
            "Results may be truncated.",
            file=sys.stderr,
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment