Created
July 5, 2023 14:32
-
-
Save pfmoore/303f42bd713e6f90c00b093b8c31e9e8 to your computer and use it in GitHub Desktop.
Download metadata files from PyPI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import json | |
from multiprocessing.dummy import Pool | |
from pathlib import Path | |
import urllib3 | |
# Directory that holds the downloaded ``*.metadata`` files. The path is
# relative, so it is resolved against the current working directory.
DOWNLOAD_CACHE = Path("DownloadedMetadata")
class Downloader:
    """Fetch files over HTTP and store them in ``DOWNLOAD_CACHE``.

    One ``urllib3.PoolManager`` is created per instance and reused for every
    request made through :meth:`download`.
    """

    def __init__(self):
        self.pool_mgr = urllib3.PoolManager(maxsize=10)
        # Make sure the cache directory exists before any worker tries to
        # write into it; exist_ok keeps this safe on repeated runs.
        DOWNLOAD_CACHE.mkdir(parents=True, exist_ok=True)

    def download(self, filename: str, url: str):
        """Download ``url`` and save the body as ``DOWNLOAD_CACHE / filename``.

        Responses whose status is not 200 are reported on stdout and NOT
        written, so an error page never ends up in the cache posing as a
        metadata file. Always returns ``None``.
        """
        resp = self.pool_mgr.request("GET", url)
        if resp.status != 200:
            print(f"{filename}: download failed with HTTP status {resp.status}")
            return
        target = DOWNLOAD_CACHE / filename
        target.write_bytes(resp.data)
def already_downloaded(filename: str, hashes: bool | dict[str, str]):
    """Return True when ``filename`` is present in the cache and verifies.

    ``hashes`` is either a plain boolean (nothing to verify, so the mere
    existence of the cached file is enough) or a mapping of hash algorithm
    name to expected hex digest. With a mapping, every listed digest must
    match the cached file's contents; the first mismatch is reported on
    stdout and the function returns False.
    """
    cache_entry = DOWNLOAD_CACHE / filename
    if not cache_entry.exists():
        return False

    if isinstance(hashes, dict):
        data = cache_entry.read_bytes()
        for name, expected in hashes.items():
            actual = hashlib.new(name, data).hexdigest()
            if actual != expected:
                print(f"{filename}: {name} hash does not match")
                return False

    return True
def get_metadata_list(pypi: str):
    """Yield ``(filename, url)`` pairs for metadata files still to be fetched.

    ``pypi`` is the path of a JSON file containing a top-level ``"projects"``
    list; each project may carry a ``"files"`` list whose entries provide
    ``"filename"`` and ``"url"``. An entry is yielded (with ``.metadata``
    appended to both values) when it advertises metadata and
    ``already_downloaded`` does not report a valid cached copy.

    Progress messages are printed to stdout while the JSON file is loaded.
    """
    print(f"Reading data from {pypi}... ", end="", flush=True)
    with open(pypi, "rb") as f:
        data = json.load(f)
    print("OK", flush=True)

    for project in data["projects"]:
        for file in project.get("files", []):
            # PEP 714 renamed the key to "core-metadata"; prefer it and fall
            # back to the legacy "data-dist-info-metadata" name.
            metadata = file.get("core-metadata")
            if metadata is None:
                metadata = file.get("data-dist-info-metadata", False)
            if not metadata:
                continue

            filename = file["filename"] + ".metadata"
            url = file["url"] + ".metadata"
            if not already_downloaded(filename, metadata):
                yield filename, url
if __name__ == "__main__":
    import sys

    # The single required argument is the path of the JSON file to read.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} PYPI_JSON_FILE")

    # Create the cache directory up front so the first run does not fail
    # when the workers try to write into it.
    DOWNLOAD_CACHE.mkdir(parents=True, exist_ok=True)

    d = Downloader()
    to_fetch = list(get_metadata_list(sys.argv[1]))
    print(f"{len(to_fetch)} downloads still to complete")

    # multiprocessing.dummy.Pool is thread-based, which suits these
    # I/O-bound downloads; starmap unpacks each (filename, url) pair.
    with Pool() as p:
        results = p.starmap(d.download, to_fetch)
    print(f"{len(results)} files downloaded")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment