plurch · May 14, 2026 16:44
diff --git a/fetch_stargazers.py b/fetch_stargazers.py
 """
 Fetch all stargazers for a GitHub repo with starred_at timestamps and write to CSV.

 Usage:
    python fetch_stargazers.py <owner/repo> [output.csv]

 Requires:
    GITHUB_AUTH environment variable with a GitHub token.

 Notes:
    - Uses the `application/vnd.github.star+json` Accept header to include
      `starred_at` timestamps (otherwise the API returns just the user object).
    - Paginates 100 per page. Max 40,000 stargazers returnable (GitHub's
      pagination limit of 400 pages * 100 per page). Repos with more than
      40k stars will be truncated at that boundary.
    - Honors X-RateLimit-Remaining / X-RateLimit-Reset and sleeps if needed.
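    - Retries on 5xx with exponential backoff; on 403/429 it waits for
      Retry-After or the rate-limit reset (when the response provides them)
      before retrying.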
 """

 import csv
 import os
 import sys
 import time
 from typing import Iterator

 import requests

 # Base URL of the GitHub REST API; endpoint paths are appended to it.
 API_ROOT = "https://api.github.com"
 # Page size requested from the stargazers endpoint.
 PER_PAGE = 100
 MAX_PAGES = 400  # GitHub hard limit: 40,000 stargazers via this endpoint
 # Media type that makes the API include `starred_at` with each stargazer.
 ACCEPT_HEADER = "application/vnd.github.star+json"
 # Number of attempts get_with_retry makes for a single request.
 MAX_RETRIES = 5


 def make_session(token: str) -> requests.Session:
    """Build a requests session preconfigured for the GitHub REST API.

    Every request made through the returned session carries the star+json
    media type, the bearer *token*, the pinned API version and a fixed
    User-Agent.
    """
    default_headers = {
        "Accept": ACCEPT_HEADER,
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "stargazer-fetcher",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session


 def handle_rate_limit(resp: requests.Response) -> None:
    """Sleep until the rate-limit window resets when the quota is exhausted.

    Reads ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` from *resp*.
    Nothing happens unless both headers are present and the remaining count
    is exactly zero; in that case the call blocks until one second past the
    reset timestamp.
    """
    headers = resp.headers
    remaining = headers.get("X-RateLimit-Remaining")
    reset = headers.get("X-RateLimit-Reset")

    # Guard clauses: only an exhausted quota with a known reset time sleeps.
    if remaining is None:
        return
    if int(remaining) != 0:
        return
    if reset is None:
        return

    seconds_left = int(reset) - int(time.time())
    sleep_for = max(seconds_left, 0) + 1  # one extra second of safety margin
    print(f"  rate limit hit; sleeping {sleep_for}s until reset", file=sys.stderr)
    time.sleep(sleep_for)


 def get_with_retry(session: requests.Session, url: str, params: dict) -> requests.Response:
    """GET *url*, retrying transient failures and honoring rate limits.

    Up to ``MAX_RETRIES`` attempts are made:

    - 200 returns the response immediately.
    - 403/429 with ``Retry-After`` sleeps for that long; with an exhausted
      quota and a known reset time it defers to ``handle_rate_limit``;
      otherwise it backs off starting at one minute, so a rate-limited
      request is never retried immediately.
    - A 403 carrying no rate-limit signal at all is treated as a permission
      error and raised at once instead of being retried.
    - 5xx responses, connection errors and timeouts back off exponentially.
    - Any other error status raises immediately.

    No sleep is performed after the final attempt.

    Raises:
        requests.HTTPError: for a non-retryable status, or when the last
            attempt still returned a retryable error status.
        requests.ConnectionError, requests.Timeout: when the last attempt
            failed at the network level.
        RuntimeError: if ``MAX_RETRIES`` allows no attempt at all.
    """
    secondary_limit_wait = 60  # seconds; GitHub asks for at least one minute
    resp = None

    for attempt in range(MAX_RETRIES):
        is_last = attempt == MAX_RETRIES - 1
        backoff = 2 ** attempt

        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Network hiccups are transient too; give up only on the last try.
            if is_last:
                raise
            print(f"  request failed ({exc}), backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            continue

        if resp.status_code == 200:
            return resp

        if resp.status_code in (403, 429):
            retry_after = resp.headers.get("Retry-After")
            quota_exhausted = resp.headers.get("X-RateLimit-Remaining") == "0"
            reset_known = resp.headers.get("X-RateLimit-Reset") is not None

            if (
                resp.status_code == 403
                and not retry_after
                and not quota_exhausted
                and "rate limit" not in resp.text.lower()
            ):
                # A 403 with no rate-limit signal is a permission problem;
                # retrying cannot fix it, so fail fast.
                resp.raise_for_status()

            if is_last:
                break  # no attempt follows, so sleeping would be wasted

            if retry_after:
                try:
                    sleep_for = int(retry_after) + 1
                except ValueError:
                    # Retry-After may be an HTTP-date rather than seconds.
                    sleep_for = secondary_limit_wait
                print(f"  got {resp.status_code}, Retry-After={retry_after}s", file=sys.stderr)
                time.sleep(sleep_for)
            elif quota_exhausted and reset_known:
                handle_rate_limit(resp)
            else:
                # No timing hint at all: never hammer the API with an
                # immediate retry.
                sleep_for = secondary_limit_wait * backoff
                print(f"  got {resp.status_code}, backing off {sleep_for}s", file=sys.stderr)
                time.sleep(sleep_for)
            continue

        if 500 <= resp.status_code < 600:
            if is_last:
                break
            print(f"  got {resp.status_code}, backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            continue

        # Non-retryable error
        resp.raise_for_status()

    if resp is None:
        raise RuntimeError("MAX_RETRIES must be at least 1")
    resp.raise_for_status()
    return resp  # only reached for a non-200 status that is not an error


 def iter_stargazers(session: requests.Session, repo: str) -> Iterator[dict]:
    """Generate one ``{starred_at, user_login, user_id}`` dict per stargazer.

    Pages through the repo's stargazers endpoint, ``PER_PAGE`` rows at a
    time, and stops at the first empty or short page or after ``MAX_PAGES``
    pages, whichever comes first. Missing fields are emitted as ``""``.
    """
    endpoint = f"{API_ROOT}/repos/{repo}/stargazers"
    page_number = 1
    while page_number <= MAX_PAGES:
        query = {"per_page": PER_PAGE, "page": page_number}
        batch = get_with_retry(session, endpoint, query).json()
        if not batch:
            break

        for entry in batch:
            # Under the star+json media type each entry wraps the user
            # object: {starred_at, user: {...}}
            account = entry.get("user") or {}
            yield {
                "starred_at": entry.get("starred_at", ""),
                "user_login": account.get("login", ""),
                "user_id": account.get("id", ""),
            }

        # A short page means there is nothing further to fetch.
        if len(batch) < PER_PAGE:
            break

        # Brief pause between pages, independent of rate limiting.
        time.sleep(0.05)
        page_number += 1


 def main() -> int:
    """Command-line entry point: fetch a repo's stargazers into a CSV file.

    Reads ``<owner/repo>`` and an optional output path from ``sys.argv`` and
    the token from the ``GITHUB_AUTH`` environment variable. Progress and
    diagnostics are written to stderr.

    Returns:
        0 on success, 2 when the arguments or the environment are unusable.
    """
    if len(sys.argv) < 2:
        print("Usage: python fetch_stargazers.py <owner/repo> [output.csv]", file=sys.stderr)
        return 2

    repo = sys.argv[1]
    # Require exactly "owner/repo". A bare "/" check would let "owner/",
    # "/repo" and "a/b/c" through, only to fail later as an HTTP error.
    owner, _, name = repo.partition("/")
    if not owner or not name or "/" in name:
        print(f"Repo must be in 'owner/repo' format, got: {repo}", file=sys.stderr)
        return 2

    out_path = sys.argv[2] if len(sys.argv) >= 3 else f"{repo.replace('/', '_')}_stargazers.csv"

    token = os.environ.get("GITHUB_AUTH")
    if not token:
        print("GITHUB_AUTH environment variable is not set", file=sys.stderr)
        return 2

    count = 0
    # The session is a context manager: its pooled connections are released
    # even when fetching fails part-way through.
    with make_session(token) as session:
        print(f"Fetching stargazers for {repo} -> {out_path}", file=sys.stderr)
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["starred_at", "user_login", "user_id"])
            writer.writeheader()
            for row in iter_stargazers(session, repo):
                writer.writerow(row)
                count += 1
                if count % 500 == 0:
                    print(f"  {count} stargazers...", file=sys.stderr)

    print(f"Done. Wrote {count} rows to {out_path}", file=sys.stderr)
    # Reaching the cap means further stargazers may exist beyond it.
    if count >= MAX_PAGES * PER_PAGE:
        print(
            f"WARNING: hit GitHub's pagination cap ({MAX_PAGES * PER_PAGE}). "
            "Results may be truncated.",
            file=sys.stderr,
        )
    return 0


 if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
	"""
	Fetch all stargazers for a GitHub repo with starred_at timestamps and write to CSV.

	Usage:
	python fetch_stargazers.py <owner/repo> [output.csv]

	Requires:
	GITHUB_AUTH environment variable with a GitHub token.

	Notes:
	- Uses the `application/vnd.github.star+json` Accept header to include
	`starred_at` timestamps (otherwise the API returns just the user object).
	- Paginates 100 per page. Max 40,000 stargazers returnable (GitHub's
	pagination limit of 400 pages * 100 per page). Repos with more than
	40k stars will be truncated at that boundary.
	- Honors X-RateLimit-Remaining / X-RateLimit-Reset and sleeps if needed.
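	- Retries on 5xx with exponential backoff; on 403/429 it waits for
	Retry-After or the rate-limit reset (when the response provides them)
	before retrying.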
	"""

	import csv
	import os
	import sys
	import time
	from typing import Iterator

	import requests

	# Base URL of the GitHub REST API; endpoint paths are appended to it.
	API_ROOT = "https://api.github.com"
	# Page size requested from the stargazers endpoint.
	PER_PAGE = 100
	MAX_PAGES = 400 # GitHub hard limit: 40,000 stargazers via this endpoint
	# Media type that makes the API include `starred_at` with each stargazer.
	ACCEPT_HEADER = "application/vnd.github.star+json"
	# Number of attempts get_with_retry makes for a single request.
	MAX_RETRIES = 5


	def make_session(token: str) -> requests.Session:
	    """Build a requests session preconfigured for the GitHub REST API.

	    Every request made through the returned session carries the star+json
	    media type, the bearer *token*, the pinned API version and a fixed
	    User-Agent.
	    """
	    s = requests.Session()
	    s.headers.update({
	        "Accept": ACCEPT_HEADER,
	        "Authorization": f"Bearer {token}",
	        "X-GitHub-Api-Version": "2022-11-28",
	        "User-Agent": "stargazer-fetcher",
	    })
	    return s


	def handle_rate_limit(resp: requests.Response) -> None:
	    """Sleep until the rate-limit window resets when the quota is exhausted.

	    Reads ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` from *resp*.
	    Nothing happens unless both headers are present and the remaining count
	    is exactly zero; in that case the call blocks until one second past the
	    reset timestamp.
	    """
	    remaining = resp.headers.get("X-RateLimit-Remaining")
	    reset = resp.headers.get("X-RateLimit-Reset")
	    if remaining is not None and int(remaining) == 0 and reset is not None:
	        # Clamp at zero in case the reset time has already passed, then
	        # add one second of safety margin.
	        sleep_for = max(int(reset) - int(time.time()), 0) + 1
	        print(f" rate limit hit; sleeping {sleep_for}s until reset", file=sys.stderr)
	        time.sleep(sleep_for)


	def get_with_retry(session: requests.Session, url: str, params: dict) -> requests.Response:
	    """GET *url*, retrying transient failures and honoring rate limits.

	    Up to ``MAX_RETRIES`` attempts are made:

	    - 200 returns the response immediately.
	    - 403/429 with ``Retry-After`` sleeps for that long; with an exhausted
	      quota and a known reset time it defers to ``handle_rate_limit``;
	      otherwise it backs off starting at one minute, so a rate-limited
	      request is never retried immediately.
	    - A 403 carrying no rate-limit signal at all is treated as a permission
	      error and raised at once instead of being retried.
	    - 5xx responses, connection errors and timeouts back off exponentially.
	    - Any other error status raises immediately.

	    No sleep is performed after the final attempt.

	    Raises:
	        requests.HTTPError: for a non-retryable status, or when the last
	            attempt still returned a retryable error status.
	        requests.ConnectionError, requests.Timeout: when the last attempt
	            failed at the network level.
	        RuntimeError: if ``MAX_RETRIES`` allows no attempt at all.
	    """
	    secondary_limit_wait = 60  # seconds; GitHub asks for at least one minute
	    resp = None

	    for attempt in range(MAX_RETRIES):
	        is_last = attempt == MAX_RETRIES - 1
	        backoff = 2 ** attempt

	        try:
	            resp = session.get(url, params=params, timeout=30)
	        except (requests.ConnectionError, requests.Timeout) as exc:
	            # Network hiccups are transient too; give up only on the last try.
	            if is_last:
	                raise
	            print(f" request failed ({exc}), backing off {backoff}s", file=sys.stderr)
	            time.sleep(backoff)
	            continue

	        if resp.status_code == 200:
	            return resp

	        if resp.status_code in (403, 429):
	            retry_after = resp.headers.get("Retry-After")
	            quota_exhausted = resp.headers.get("X-RateLimit-Remaining") == "0"
	            reset_known = resp.headers.get("X-RateLimit-Reset") is not None

	            if (
	                resp.status_code == 403
	                and not retry_after
	                and not quota_exhausted
	                and "rate limit" not in resp.text.lower()
	            ):
	                # A 403 with no rate-limit signal is a permission problem;
	                # retrying cannot fix it, so fail fast.
	                resp.raise_for_status()

	            if is_last:
	                break  # no attempt follows, so sleeping would be wasted

	            if retry_after:
	                try:
	                    sleep_for = int(retry_after) + 1
	                except ValueError:
	                    # Retry-After may be an HTTP-date rather than seconds.
	                    sleep_for = secondary_limit_wait
	                print(f" got {resp.status_code}, Retry-After={retry_after}s", file=sys.stderr)
	                time.sleep(sleep_for)
	            elif quota_exhausted and reset_known:
	                handle_rate_limit(resp)
	            else:
	                # No timing hint at all: never hammer the API with an
	                # immediate retry.
	                sleep_for = secondary_limit_wait * backoff
	                print(f" got {resp.status_code}, backing off {sleep_for}s", file=sys.stderr)
	                time.sleep(sleep_for)
	            continue

	        if 500 <= resp.status_code < 600:
	            if is_last:
	                break
	            print(f" got {resp.status_code}, backing off {backoff}s", file=sys.stderr)
	            time.sleep(backoff)
	            continue

	        # Non-retryable error
	        resp.raise_for_status()

	    if resp is None:
	        raise RuntimeError("MAX_RETRIES must be at least 1")
	    resp.raise_for_status()
	    return resp  # only reached for a non-200 status that is not an error


	def iter_stargazers(session: requests.Session, repo: str) -> Iterator[dict]:
	    """Yield {starred_at, user_login, user_id} for every stargazer.

	    Pages through the repo's stargazers endpoint, ``PER_PAGE`` rows at a
	    time, and stops at the first empty or short page or after ``MAX_PAGES``
	    pages, whichever comes first. Missing fields are emitted as ``""``.
	    """
	    url = f"{API_ROOT}/repos/{repo}/stargazers"
	    for page in range(1, MAX_PAGES + 1):
	        resp = get_with_retry(session, url, {"per_page": PER_PAGE, "page": page})
	        rows = resp.json()

	        if not rows:
	            return

	        for row in rows:
	            # With the star+json Accept header, each row is {starred_at, user: {...}}
	            user = row.get("user") or {}
	            yield {
	                "starred_at": row.get("starred_at", ""),
	                "user_login": user.get("login", ""),
	                "user_id": user.get("id", ""),
	            }

	        # A short page means there is nothing further to fetch.
	        if len(rows) < PER_PAGE:
	            return

	        # Light pacing to be polite even when not rate-limited
	        time.sleep(0.05)


	def main() -> int:
	    """Command-line entry point: fetch a repo's stargazers into a CSV file.

	    Reads ``<owner/repo>`` and an optional output path from ``sys.argv`` and
	    the token from the ``GITHUB_AUTH`` environment variable. Progress and
	    diagnostics are written to stderr.

	    Returns:
	        0 on success, 2 when the arguments or the environment are unusable.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python fetch_stargazers.py <owner/repo> [output.csv]", file=sys.stderr)
	        return 2

	    repo = sys.argv[1]
	    # Require exactly "owner/repo". A bare "/" check would let "owner/",
	    # "/repo" and "a/b/c" through, only to fail later as an HTTP error.
	    owner, _, name = repo.partition("/")
	    if not owner or not name or "/" in name:
	        print(f"Repo must be in 'owner/repo' format, got: {repo}", file=sys.stderr)
	        return 2

	    out_path = sys.argv[2] if len(sys.argv) >= 3 else f"{repo.replace('/', '_')}_stargazers.csv"

	    token = os.environ.get("GITHUB_AUTH")
	    if not token:
	        print("GITHUB_AUTH environment variable is not set", file=sys.stderr)
	        return 2

	    count = 0
	    # The session is a context manager: its pooled connections are released
	    # even when fetching fails part-way through.
	    with make_session(token) as session:
	        print(f"Fetching stargazers for {repo} -> {out_path}", file=sys.stderr)
	        with open(out_path, "w", newline="", encoding="utf-8") as f:
	            writer = csv.DictWriter(f, fieldnames=["starred_at", "user_login", "user_id"])
	            writer.writeheader()
	            for row in iter_stargazers(session, repo):
	                writer.writerow(row)
	                count += 1
	                if count % 500 == 0:
	                    print(f" {count} stargazers...", file=sys.stderr)

	    print(f"Done. Wrote {count} rows to {out_path}", file=sys.stderr)
	    # Reaching the cap means further stargazers may exist beyond it.
	    if count >= MAX_PAGES * PER_PAGE:
	        print(
	            f"WARNING: hit GitHub's pagination cap ({MAX_PAGES * PER_PAGE}). "
	            "Results may be truncated.",
	            file=sys.stderr,
	        )
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    sys.exit(main())
No results found