#!/usr/bin/env python3
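"""Crawl repositories from a GitHub API and record malformed entries.

Takes the API base URL and a token as positional arguments; --affiliated and
--public select which sets of repositories to fetch (both when neither flag
is given). Items with a missing, empty, or non-alphanumeric id/full_name are
written to the file "faulty-repos".
"""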

import argparse
import json
import time
from os import path
from typing import Tuple

import requests


def get_repos(endpoint: str, headers: dict) -> Tuple[bool, int]:
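    """Fetch one page of repositories from `endpoint` and validate each item.

    Items with a missing, empty, or non-alphanumeric id/full_name are recorded
    via write_item_to_file. Returns (has_next_page, next_since): has_next_page
    is True when a full page of 100 items came back, and next_since is the id
    of the last repository on the page (0 for an empty page).
    """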
    res = requests.get(endpoint, headers=headers)
    if res.status_code != 200:
        raise Exception(f"Request failed {res.status_code}, {res.content}")

    data = res.json()

    for item in data:
        if item is None:
            write_debug_logs(res)
            break

        try:
            repo_id = item["id"]
            full_name = item["full_name"]
        except KeyError:
            write_item_to_file(f"Either id or full_name does not exist in response: {item}")
            continue

        # Strip any potential whitespace before checking for an empty string.
        if str(repo_id).strip() == "" or full_name.strip() == "":
            write_item_to_file(f"Either id or full_name is an empty string in response: {item}")
            continue

        # Make sure at least one character in either id or full_name is
        # alphanumeric. It probably is, but we want to catch any weirdness here.
        if not at_least_one_char_isalnum(str(repo_id)) or not at_least_one_char_isalnum(full_name):
            write_item_to_file(f"Either id or full_name does not have any alpha-numeric chars: {item}")
            continue

    next_since = 0
    if len(data) > 0:
        next_since = data[-1]["id"]

    # Returning (has_next_page, next_since).
    return len(data) == 100, next_since


def get_public_repos(url: str, headers: dict, since: int):
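    """Walk all public repositories in pages of 100, starting from repository ID `since`."""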
    print("Getting public repos")
    counter = 1
    while True:
        # Sleep for 1 second between every 10 requests to avoid hammering the API.
        if counter % 10 == 0:
            time.sleep(1)

        # Print progress every 10000 repos (100 requests).
        if counter % 100 == 0:
            print(f"Total repos processed {counter * 100}, now getting repos since ID {since}...")

        endpoint = path.join(url, f"repositories?per_page=100&since={since}")
        has_next_page, next_since = get_repos(endpoint, headers)
        if not has_next_page:
            break

        since = next_since
        counter += 1


def get_affiliated_repos(url: str, headers: dict, page: int):
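    """Walk the authenticated user's affiliated repositories in pages of 100, starting at `page`."""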
    print("Getting affiliated repos for user")
    counter = 1
    while True:
        # Sleep for 1 second between every 10 requests to avoid hammering the API.
        if counter % 10 == 0:
            time.sleep(1)

        # Print progress every 10000 repos (100 requests).
        if counter % 100 == 0:
            print(f"Total repos processed {counter * 100}, now getting repos from page {page}...")

        endpoint = path.join(
            url,
            f"user/repos?sort=created&visibility=all&page={page}&per_page=100",
        )

        has_next_page, _ = get_repos(endpoint, headers)
        if not has_next_page:
            break

        page += 1
        counter += 1


def write_debug_logs(response: requests.Response):
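    """Append the response status, headers, URL, and body to debug-github-repos.json."""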
    data = {
        "status_code": response.status_code,
        "headers": dict(response.headers),
        "url": response.url,
        "content": str(response.content),
        "response_json": response.json(),
    }

    with open("debug-github-repos.json", "a") as f:
        json.dump(data, f)
        # Newline so repeated dumps end up as one JSON object per line.
        f.write("\n")


def write_item_to_file(item: str):
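    """Append one line describing a faulty repository item to the faulty-repos file."""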
    with open("faulty-repos", "a") as f:
        f.write(item)
        f.write("\n")


def at_least_one_char_isalnum(s: str) -> bool:
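    """Return True if at least one character of `s` is alphanumeric."""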
    for c in s:
        if c.isalnum():
            return True

    return False


def main(args: argparse.Namespace):
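    """Build request headers from the token and run the selected fetchers."""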
    headers = {
        "Accept": "application/vnd.github.jean-grey-preview+json,application/vnd.github.mercy-preview+json,application/vnd.github.machine-man-preview+json",
        "Authorization": "Token " + args.token,
    }

    # If neither --affiliated nor --public is set, fetch both.
    if args.affiliated is None and args.public is None:
        args.affiliated = True
        args.public = True

    if args.affiliated:
        get_affiliated_repos(args.url, headers, args.affiliated_page)

    if args.public:
        get_public_repos(args.url, headers, args.public_since)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str, help="URL of the API, example: https://api.github.com/v3/")
    parser.add_argument("token", type=str, help="Secret token")

    parser.add_argument("--affiliated", action="store_true", default=None, help="Optionally fetch affiliated repos")
    parser.add_argument("--public", action="store_true", default=None, help="Optionally fetch public repos")

    parser.add_argument("--affiliated-page", type=int, default=0, help="From which page (100 per page) should we start fetching affiliated repos?")
    parser.add_argument("--public-since", type=int, default=0, help="Since which repo should we start fetching?")

    args = parser.parse_args()
    # print(args)
    main(args)