Created
September 17, 2017 09:33
-
-
Save gamozolabs/282748fc925afdafcc4e892c8d773aca to your computer and use it in GitHub Desktop.
Mirror all of crates.io
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import hashlib
import json
import os
import subprocess
import sys
import urllib

try:  # Python 2 module locations
    import urlparse
    from urllib import urlretrieve
except ImportError:  # Python 3 moved both under the urllib package
    import urllib.parse as urlparse
    from urllib.request import urlretrieve
# Git repository holding the crates.io registry index; it is cloned (or pulled)
# into the local "crates.io-index" directory.
CRATES_IO_GIT = "https://github.com/rust-lang/crates.io-index"

# Download URL base this script expects to find as the "dl" entry of the
# index's config.json. Crate archives are fetched from
# DOWNLOAD_BASE + "/<name>/<vers>/download".
DOWNLOAD_BASE = "https://crates.io/api/v1/crates"
def _require(condition, message):
    """Raise ``AssertionError(message)`` unless *condition* is true.

    Used in place of ``assert`` so the checks (including the path-jail
    check) still run under ``python -O``, while the exception type seen by
    callers stays the same.
    """
    if not condition:
        raise AssertionError(message)


def _run_git(args, cwd=None):
    """Run ``git <args>`` without a shell; report failures, never raise.

    Returns True when git exited with status 0, False otherwise.
    """
    try:
        status = subprocess.call(["git"] + list(args), cwd=cwd)
    except OSError as err:
        # git is missing or could not be started.
        print("Could not run git: %s" % err)
        return False
    if status != 0:
        print("git %s exited with status %d" % (args[0], status))
    return status == 0


def parse_index(index_dir="crates.io-index", download_base=None, update=True):
    """Build the list of crate downloads described by the crates.io index.

    Args:
        index_dir: Directory of the local index checkout.
        download_base: Expected value of the "dl" entry in the index's
            config.json; defaults to the module-level DOWNLOAD_BASE.
        update: When true, ``git pull`` an existing checkout or clone
            CRATES_IO_GIT into *index_dir* first. A failing git command is
            reported but does not abort; whatever index is on disk is used.

    Returns:
        A list of ``(dl_url, local_path, cksum)`` tuples, one per index
        record. ``local_path`` is the absolute path, below the current
        working directory, that mirrors the URL path of ``dl_url``.

    Raises:
        AssertionError: config.json is missing or has an unexpected "dl"
            value, a record lacks "name"/"vers"/"cksum", or a record would
            map to a local path outside the download directory.
    """
    if download_base is None:
        download_base = DOWNLOAD_BASE

    if update:
        if os.path.exists(index_dir):
            print("Updating crates.io-index git repo")
            _run_git(["pull"], cwd=index_dir)
        else:
            print("Cloning crates.io-index git repo for the first time")
            _run_git(["clone", CRATES_IO_GIT, index_dir])

    # First make sure config.json exists, then grab the "dl" record from it.
    config_path = os.path.join(index_dir, "config.json")
    _require(os.path.exists(config_path),
             "Expected config.json file in root of crates.io index")
    with open(config_path, "rb") as fd:
        config = json.load(fd)
    _require("dl" in config, "Expected download URL base in config.json")

    # This check is deliberately strict: if the download API changes, the
    # script should be re-checked rather than silently mirroring wrong URLs.
    _require(download_base == config["dl"],
             "Crates.io index uses unexpected download base")

    # Every local path must stay below this directory. The trailing
    # separator matters: a bare prefix test would also accept a sibling
    # such as ".../crates_evil".
    jail_prefix = os.path.join(
        os.path.abspath(urlparse.urlparse(download_base).path[1:]), "")

    # Index layout: names of 1 and 2 characters live two levels deep
    # (1/<name>, 2/<name>); everything else is three levels deep
    # (3/<c>/<name>, <ab>/<cd>/<name>).
    patterns = (
        os.path.join(index_dir, "1", "*"),
        os.path.join(index_dir, "2", "*"),
        os.path.join(index_dir, "*", "*", "*"),
    )

    print("Processing JSON of crates.io indicies, this may take ~10 seconds")
    downloads = []
    for pattern in patterns:
        for fn in glob.iglob(pattern):
            if not os.path.isfile(fn):
                continue
            with open(fn, "rb") as fd:
                # There is one JSON record per line; tolerate blank lines.
                for line in fd:
                    line = line.strip()
                    if not line:
                        continue
                    record = json.loads(line)
                    for key in ("name", "vers", "cksum"):
                        _require(key in record,
                                 "Index record in %s is missing %r" % (fn, key))

                    dl_url = download_base + "/%s/%s/download" % (
                        record["name"], record["vers"])

                    # Local path to download to, checked against the jail
                    # after normalisation so "../" components cannot escape.
                    local_path = os.path.abspath(
                        urlparse.urlparse(dl_url).path[1:])
                    _require(local_path.startswith(jail_prefix),
                             "Download path seems to go up a directory :O")

                    downloads.append((dl_url, local_path, record["cksum"]))

    return downloads
def _sha256_of_file(path, chunk_size=1 << 20):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in chunks so large crates are not loaded into memory
    at once, and the handle is always closed.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fd:
        for chunk in iter(lambda: fd.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def mirror(download, verify):
    """Download and/or verify every crate listed in the crates.io index.

    Args:
        download: When true, fetch each crate whose local file is missing.
        verify: When true, compare each local file's SHA-256 against the
            index checksum; a mismatching file is logged and deleted.

    Progress and final statistics are printed; hash mismatches and failed
    downloads are appended to ``log.txt`` in the current directory.
    """
    # Some stats
    verified = 0
    badhash = 0
    downloaded = 0
    cached = 0

    # Text-mode append: the log only ever receives str lines.
    with open("log.txt", "a") as logfd:
        logfd.write("==== Starting new session =====\n")

        downloads = parse_index()
        total = len(downloads)

        for ii, (dl_url, local_path, cksum) in enumerate(downloads, 1):
            pct = float(ii) / float(total)
            print("[%8d of %8d (%8.4f)] %100s -> %100s" % (
                ii, total, pct, dl_url, local_path))

            # Download if requested and not already downloaded
            if download and not os.path.exists(local_path):
                target_dir = os.path.dirname(local_path)
                if not os.path.isdir(target_dir):
                    os.makedirs(target_dir)

                # Fetch to a temporary name and rename on success, so an
                # interrupted transfer never leaves a truncated file that a
                # later run would treat as already downloaded.
                part_path = local_path + ".part"
                try:
                    urlretrieve(dl_url, part_path)
                    os.rename(part_path, local_path)
                except (IOError, OSError) as err:
                    # One bad crate should not abort the whole mirror.
                    logfd.write("Download failed for %s (%s)\n" % (dl_url, err))
                    if os.path.exists(part_path):
                        os.remove(part_path)
                else:
                    downloaded += 1

            present = os.path.exists(local_path)

            # Verify if requested and file exists
            if verify and present:
                our_hash = _sha256_of_file(local_path)
                if our_hash != cksum:
                    # If verify failed, remove the file
                    logfd.write("Hash mismatch on %s (has %s, exp %s)\n" % (
                        local_path, our_hash, cksum))
                    os.remove(local_path)
                    badhash += 1
                    present = False
                else:
                    verified += 1

            # Counted after verification so a file removed for a bad hash
            # shows up as missing instead of cached.
            if present:
                cached += 1

    print("Stats:")
    print(" Downloaded: %8d" % downloaded)
    print(" Cached: %8d" % cached)
    print(" Missing: %8d" % (total - cached))
    print(" Invalid hash: %8d" % badhash)
    print(" Verified: %8d" % verified)

    if cached == total and verified == total:
        print("Successfully downloaded and verified all files :D")
    elif cached != total:
        print("Not all files have been downloaded")
    elif verified != total:
        print("Not all files have been verified")
def main(argv=None):
    """Parse the command line and run the mirror.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Recognised words are ``download`` and
            ``verify``, in any order.

    Returns:
        0 after the mirror ran, 2 when an unknown argument was given. In the
        latter case only the usage line is printed and nothing is mirrored.
    """
    if argv is None:
        argv = sys.argv[1:]

    download = False
    verify = False
    for arg in argv:
        if arg == "download":
            download = True
        elif arg == "verify":
            verify = True
        else:
            # Stop here: a mistyped argument must not start a mirror run.
            print("Usage: mirror.py [download] [verify]")
            return 2

    mirror(download, verify)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment