Skip to content

Instantly share code, notes, and snippets.

@gamozolabs
Created September 17, 2017 09:33
Show Gist options
  • Save gamozolabs/282748fc925afdafcc4e892c8d773aca to your computer and use it in GitHub Desktop.
Mirror all of crates.io
import glob
import hashlib
import json
import os
import sys
import urllib.parse
import urllib.request
# Git URL of the crates.io registry index (one JSON line per published
# crate version).  NOTE: the scraped copy of this script carried a
# link-shortener-mangled host ("github.com"); the real repo is on
# github.com.
CRATES_IO_GIT = "https://github.com/rust-lang/crates.io-index"
# Expected download API base; parse_index() asserts the index's
# config.json still agrees with this before mirroring anything.
DOWNLOAD_BASE = "https://crates.io/api/v1/crates"
def parse_index():
    """Sync the crates.io-index git repo and build the download worklist.

    Clones (or pulls) the registry index, validates its config.json
    against DOWNLOAD_BASE, then walks every index file and produces one
    work item per published crate version.

    Returns:
        list of (download_url, local_path, sha256_hex) tuples.

    Raises:
        AssertionError: if config.json is missing, the download base has
            changed, an index record lacks a required field, or a
            computed local path would escape the mirror directory.
    """
    if os.path.exists("crates.io-index"):
        print("Updating crates.io-index git repo")
        os.system("cd crates.io-index ; git pull")
    else:
        print("Cloning crates.io-index git repo for the first time")
        os.system("git clone %s" % CRATES_IO_GIT)

    # First make sure config.json exists in the index root.
    config_path = os.path.join("crates.io-index", "config.json")
    assert os.path.exists(config_path), \
        "Expected config.json file in root of crates.io index"
    # Now parse the json and grab the "dl" record from it.
    with open(config_path, "rb") as fd:
        j = json.load(fd)
    assert "dl" in j, "Expected download URL base in config.json"
    # This assertion is very strict on purpose.  If the download API (or
    # its URL/version) ever changes we want to fail loudly and re-audit
    # this script rather than silently mirror garbage.
    assert DOWNLOAD_BASE == j["dl"], \
        "Crates.io index uses unexpected download base"

    downloads = []
    # Create a listing of all the URLs to download and the local paths to
    # download them to.
    print("Processing JSON of crates.io indices, this may take ~10 seconds")
    # Index layout: 1-char crate names live in "1/", 2-char names in
    # "2/", everything else two directories deep.  The original single
    # "*/*/*" glob silently skipped the short-name crates.
    patterns = ("crates.io-index/1/*",
                "crates.io-index/2/*",
                "crates.io-index/*/*/*")
    for pattern in patterns:
        for fn in glob.iglob(pattern):
            with open(fn, "rb") as fd:
                buf = fd.read()
            # There is a new json record on each line (one per version).
            for line in buf.splitlines():
                j = json.loads(line)
                assert "name" in j
                assert "vers" in j
                assert "cksum" in j
                dl_url = DOWNLOAD_BASE + "/%s/%s/download" % (j["name"], j["vers"])
                # Local path to download to.  Do some safety checks to
                # make sure the path remains jailed under ./api/v1/crates.
                local_path = os.path.abspath(urllib.parse.urlparse(dl_url).path[1:])
                expect_dir = os.path.abspath(os.path.join(os.getcwd(), "api", "v1", "crates"))
                assert local_path.startswith(expect_dir), \
                    "Download path seems to go up a directory :O"
                downloads.append((dl_url, local_path, j["cksum"]))
    return downloads
def mirror(download, verify):
    """Download and/or verify every crate listed by the crates.io index.

    Args:
        download: when True, fetch any crate file not already on disk.
        verify: when True, sha256-check every file that exists locally,
            deleting (and logging) any file whose hash mismatches so a
            later run can re-download it.

    Side effects: appends to log.txt, creates directories/files under
    ./api/v1/crates, and prints progress plus a final stats summary.
    """
    # Text mode ("a"): we only ever write str lines to the log.
    with open("log.txt", "a") as logfd:
        logfd.write("==== Starting new session =====\n")
        # Some stats
        verified = 0
        badhash = 0
        downloaded = 0
        cached = 0
        downloads = parse_index()
        total = len(downloads)
        for ii, (dl_url, local_path, cksum) in enumerate(downloads, 1):
            pct = float(ii) / float(total)
            print("[%8d of %8d (%8.4f)] %100s -> %100s" % (ii, total, pct, dl_url, local_path))
            # Download if requested and not already downloaded
            if download and not os.path.exists(local_path):
                # Create the directories (exist_ok avoids a check/create race)
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                # Download the file :)
                urllib.request.urlretrieve(dl_url, local_path)
                downloaded += 1
            # "cached" counts every file present locally, including ones
            # downloaded just now.
            if os.path.exists(local_path):
                cached += 1
            # Verify if requested and file exists
            if verify and os.path.exists(local_path):
                with open(local_path, "rb") as crate_fd:
                    our_hash = hashlib.sha256(crate_fd.read()).hexdigest()
                if our_hash != cksum:
                    # If verify failed, log it and remove the file so the
                    # next download pass fetches a fresh copy.
                    logfd.write("Hash mismatch on %s (has %s, exp %s)\n" % (local_path, our_hash, cksum))
                    os.remove(local_path)
                    badhash += 1
                else:
                    verified += 1
        print("Stats:")
        print(" Downloaded: %8d" % downloaded)
        print(" Cached: %8d" % cached)
        print(" Missing: %8d" % (total - cached))
        print(" Invalid hash: %8d" % badhash)
        print(" Verified: %8d" % verified)
        if cached == total and verified == total:
            print("Successfully downloaded and verified all files :D")
        elif cached != total:
            print("Not all files have been downloaded")
        elif verified != total:
            print("Not all files have been verified")
if __name__ == "__main__":
    # Parse the (order-insensitive) flag words from the command line.
    download = False
    verify = False
    for arg in sys.argv[1:]:
        if arg == "download":
            download = True
        elif arg == "verify":
            verify = True
        else:
            # Unknown argument: print usage and bail out.  The original
            # script printed usage but then ran the mirror anyway.
            print("Usage: mirror.py [download] [verify]")
            sys.exit(1)
    mirror(download, verify)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment