Created
September 2, 2023 16:05
-
-
Save pfmoore/fdcd1f6c68a1567b0a7430beb74938b7 to your computer and use it in GitHub Desktop.
PyPI downloader for py-code.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import subprocess | |
from concurrent.futures import ThreadPoolExecutor | |
from datetime import datetime | |
from pathlib import Path | |
from urllib.request import urlopen | |
# URL of the repositories.json listing that this script downloads.
REPOSITORIES = "https://github.com/pypi-data/data/raw/main/stats/repositories.json"

# Fetch the listing once at start-up and decode the JSON payload; the
# decoded value is what the rest of the script reads as ``repo_data``.
with urlopen(REPOSITORIES) as f:
    repo_data = json.loads(f.read())
def clone_repo(repo, repo_dir, output_dir="output"):
    """Clone one repository with ``git clone`` and write a log file for it.

    Args:
        repo: Mapping providing the keys ``"index"``, ``"name"`` and
            ``"url"``.
        repo_dir: ``pathlib.Path`` of the parent directory for clones; the
            repository is cloned into ``repo_dir / name``.
        output_dir: Directory that receives ``<name>.output.txt``. It is
            created if missing. Defaults to ``"output"`` (relative to the
            current working directory), the location that used to be
            hard-coded.

    Returns:
        None. Start/end messages are printed to stdout; git's return code,
        stdout and stderr are written to the log file instead of being
        raised as errors.
    """
    index = repo["index"]
    name = repo["name"]
    loc = str(repo_dir / name)
    url = repo["url"]

    start = datetime.now()
    print(f"{name}: Started {start}")
    proc = subprocess.run(
        ["git", "clone", url, loc],
        capture_output=True,
        text=True,
    )
    end = datetime.now()
    # Derive the duration from the same timestamp that is reported as the
    # end time, so the two printed values always agree.
    duration = end - start
    print(f"{name}: Ended {end} ({duration})")

    # Ensure the log directory exists; without this, open() raises
    # FileNotFoundError only after the clone has already completed.
    log_dir = Path(output_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(log_dir / f"{name}.output.txt", "w", encoding="utf-8") as f:
        print(f"{index}. {name}: {url}", file=f)
        print(f"Duration: {duration}", file=f)
        print(f"Return code: {proc.returncode}", file=f)
        print("\nOutput:\n" + proc.stdout, file=f)
        print("\nErrors:\n" + proc.stderr, file=f)
# Directory that receives one sub-directory per cloned repository.
repo_dir = Path("repos")
repo_dir.mkdir(exist_ok=True, parents=True)

# NOTE(review): the [200:] slice skips the first 200 entries (ordered by
# "index") -- presumably to resume an earlier partial run; confirm before
# reusing this script from scratch.
pending = sorted(repo_data, key=lambda r: r["index"])[200:]

with ThreadPoolExecutor() as executor:
    # executor.map() re-raises a worker's exception only when its result is
    # iterated; the results were never read, so failures in clone_repo went
    # unnoticed. Submitting explicitly lets each failure be reported while
    # the remaining clones still run to completion.
    results = [
        (repo.get("name", repo["index"]), executor.submit(clone_repo, repo, repo_dir))
        for repo in pending
    ]
    for label, future in results:
        # Future.exception() waits for the task and returns its exception
        # (or None) instead of raising it here.
        error = future.exception()
        if error is not None:
            print(f"{label}: FAILED {error!r}")
# After cloning, enable long-path support in each repository (PowerShell):
# dir .\repos\pypi-mirro* | % { git -C "$_" config --local core.longpaths true }
# To fetch new data:
# dir .\repos\pypi-mirro* | Foreach-Object -Parallel { git -C "$_" fetch }
# Takes about 30s (2 min without -Parallel)
# To make object lists:
# dir .\repos\pypi-mirro* | Foreach-Object -Parallel { git -C "$_" rev-list --objects --all | Out-File -Encoding UTF8 (Join-Path objects $_.name)}
# (takes about 50 minutes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment