Created
October 9, 2020 00:54
-
-
Save willkg/02d82dcbc5cf9b00cab74992e2586819 to your computer and use it in GitHub Desktop.
Script to compress files with makecab and upload
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
October 8th, 2020. I wrote this to fix a problem where .dll and .exe files were getting | |
uploaded to symbols.mozilla.org, but hadn't been run through makecab. That prevented | |
them from getting served by symbols.mozilla.org and that broke debugging efforts. | |
I ran this script in WSL2 on Windows. It downloads files from the symbols bucket, runs | |
makecab on them, batches them up, and periodically uploads a symbols.zip file. | |
Since there are so many files involved and my Windows machine is flaky and the network | |
is flaky and life is flaky, it tries to be resilient and keeps track of progress so it | |
can pick up where it left off. | |
""" | |
import datetime
import functools
import os
import pathlib
import shutil
import subprocess
import sys
import time
import zipfile

import requests
import urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Prefix of every object URL in the public symbols bucket; file keys are
# appended to it directly.
BUCKET_URL = "https://s3-us-west-2.amazonaws.com/org.mozilla.crash-stats.symbols-public/v1/"
DOWNLOAD_API = "https://symbols.mozilla.org/"
UPLOAD_API = "https://symbols.mozilla.org/upload/"
# FIXME(willkg): use your auth token
UPLOAD_AUTH_TOKEN = ""
# Executable used to compress each downloaded file
MAKECAB = "makecab.exe"
# Local cache of downloaded (uncompressed) files
DATADIR = "data/"
# Staging directory for compressed files waiting to be zipped and uploaded
SYMDIR = "symbols/"
# 150 MiB: once the compressed files staged in SYMDIR add up to at least this
# many bytes, they are zipped and uploaded as one batch
SYMBOLS_ZIP_GOOD_SIZE = 150 * 1024 * 1024
class HTTPAdapterWithTimeout(HTTPAdapter):
    """HTTPAdapter that supplies a default timeout when a request has none.

    Accepts the same arguments as ``HTTPAdapter`` plus the keyword
    ``default_timeout`` (5.0 if omitted), which is used for any ``send`` call
    whose ``timeout`` keyword is missing or falsy.
    """

    def __init__(self, *args, **kwargs):
        # Remove our own option before the remaining kwargs reach HTTPAdapter.
        self._default_timeout = kwargs.pop("default_timeout", 5.0)
        super().__init__(*args, **kwargs)

    def send(self, *args, **kwargs):
        """Send the request, filling in the default timeout if none was given."""
        if not kwargs.get("timeout"):
            kwargs["timeout"] = self._default_timeout
        return super().send(*args, **kwargs)
def retry_session():
    """Build a requests session with retries, a default timeout and a user agent.

    :returns: a ``requests.Session`` whose http:// and https:// traffic goes
        through one ``HTTPAdapterWithTimeout`` configured with the retry policy
        below and a 5 second default timeout
    """
    retry_policy = Retry(
        total=5,
        read=5,
        connect=5,
        backoff_factor=0.2,
        # NOTE(willkg): This is in addition to 429
        status_forcelist=(500, 502, 503, 504),
    )
    adapter = HTTPAdapterWithTimeout(max_retries=retry_policy, default_timeout=5.0)

    session = requests.Session()
    # Set user agent
    session.headers.update({"User-Agent": "willkg/comp_and_upload"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)
    return session
def makedir(dn):
    """Create directory ``dn`` and any missing parents; no-op if it exists.

    :param dn: path of the directory to create. An empty string (what
        ``os.path.dirname`` returns for a bare filename) is ignored.
    :raises FileExistsError: if ``dn`` exists but is not a directory
    """
    if not dn:
        return
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(dn, exist_ok=True)
def try_harder(fun):
    """Decorator that retries ``fun`` for as long as it hits connection errors.

    The wrapped function is called again, after a one second pause, every time
    it raises ``urllib3.exceptions.ProtocolError`` or
    ``requests.exceptions.ConnectionError``. There is no retry limit. Any other
    exception propagates to the caller.

    :param fun: the function to wrap
    :returns: wrapper that returns whatever ``fun`` returns
    """

    # wraps keeps the wrapped function's __name__/__doc__ on the wrapper
    @functools.wraps(fun)
    def _try_harder(*args, **kwargs):
        while True:
            try:
                return fun(*args, **kwargs)
            except (urllib3.exceptions.ProtocolError, requests.exceptions.ConnectionError) as pe:
                print(f"try_harder error {pe!r}")
                time.sleep(1)

    return _try_harder
@try_harder
def head_file(session, fn):
    """Report whether the compressed counterpart of ``fn`` is in the bucket.

    The compressed key is ``fn`` with its last character replaced by ``_``.

    :param session: requests session used for the HEAD request
    :param fn: key of the uncompressed file, relative to ``BUCKET_URL``
    :returns: True if the HEAD request answers 200; False otherwise
    """
    compressed_key = fn[:-1] + "_"
    # See if it's there already
    response = session.head(BUCKET_URL + compressed_key)
    return response.status_code == 200
@try_harder
def download_file(session, fn, dest_fn):
    """Download ``fn`` from the bucket to ``dest_fn`` unless it is already there.

    :param session: requests session used for the GET request
    :param fn: key of the file, relative to ``BUCKET_URL``
    :param dest_fn: local path to store the file at
    :returns: True if ``dest_fn`` already existed or was downloaded; False if
        the GET request did not answer 200
    """
    if os.path.exists(dest_fn):
        return True

    url = BUCKET_URL + fn
    resp = session.get(url)
    if resp.status_code != 200:
        print(">>> ERROR: GET %s %s" % (url, resp.status_code))
        return False

    makedir(os.path.dirname(dest_fn))
    print(">>> Downloaded file: %s" % url)

    data = resp.content
    # Write to a sibling temp file and rename it into place. An interrupted
    # write then never leaves a truncated dest_fn behind, which the existence
    # check above would otherwise treat as a finished download on the next run.
    partial_fn = dest_fn + ".part"
    with open(partial_fn, "wb") as fp:
        fp.write(data)
    os.replace(partial_fn, dest_fn)
    print(f">>> Wrote {len(data):,} bytes")
    return True
def compress_file(cache_fn, symbols_tmp_fn):
    """Compress ``cache_fn`` into ``symbols_tmp_fn`` using makecab.

    Creates the destination directory if needed and prints the sizes of the
    input and output files.

    :param cache_fn: path of the uncompressed input file
    :param symbols_tmp_fn: path to write the compressed file to
    :raises subprocess.CalledProcessError: if makecab exits with a non-zero status
    """
    makedir(os.path.dirname(symbols_tmp_fn))

    command = [MAKECAB, "-D", "CompressionType=MSZIP", cache_fn, symbols_tmp_fn]
    # makecab's stdout and stderr are both discarded
    subprocess.check_call(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    size_before = get_size(cache_fn)
    size_after = get_size(symbols_tmp_fn)
    print(f">>> Compressed: {size_before:,} -> {size_after:,}")
def build_zip_file(zip_filename, sym_dir):
    """Generates a ZIP file of contents of sym dir.

    Every file found under ``sym_dir`` (recursively) is added with deflate
    compression under its path relative to ``sym_dir``. If ``sym_dir`` is
    empty or does not exist, the result is an empty ZIP file.

    :param zip_filename: full path to zip file
    :param sym_dir: full path to directory of SYM files

    :returns: path to zip file
    """
    with zipfile.ZipFile(zip_filename, mode="w") as zf:
        for root, _dirs, files in os.walk(sym_dir):
            for sym_file in files:
                full_path = os.path.join(root, sym_file)
                zf.write(
                    full_path,
                    # relpath gives the same name whether or not sym_dir has a
                    # trailing separator
                    arcname=os.path.relpath(full_path, sym_dir),
                    compress_type=zipfile.ZIP_DEFLATED,
                )
    return zip_filename
@try_harder
def upload_zip_file(session, zipfile):
    """Upload a ZIP file to ``UPLOAD_API``.

    The file is posted as a multipart form field named ``symbols.zip`` with the
    ``auth-token`` header set to ``UPLOAD_AUTH_TOKEN``. Redirects are not
    followed, and both connect and read timeouts are 300 seconds.

    :param session: requests session used for the POST request
    :param zipfile: path of the ZIP file to upload (the name shadows the
        ``zipfile`` module inside this function, which does not use the module)
    :raises requests.exceptions.HTTPError: if ``raise_for_status`` rejects the
        response status
    """
    # Read the file named by the argument rather than a hard-coded path
    with open(zipfile, "rb") as fp:
        data = fp.read()

    resp = session.post(
        UPLOAD_API,
        headers={"auth-token": UPLOAD_AUTH_TOKEN},
        allow_redirects=False,
        timeout=(300, 300),
        files={"symbols.zip": data},
    )
    # raise_for_status() is a no-op for a 200, so no status check is needed first
    resp.raise_for_status()
def get_size(fn):
    """Get the size of a file.

    :param fn: the filename to check

    :returns: 0 if the file doesn't exist; file size otherwise
    """
    try:
        # One stat call instead of exists-then-stat, so the file cannot vanish
        # between the check and the size lookup
        return os.stat(fn).st_size
    except (OSError, ValueError):
        # Same conditions under which os.path.exists() reports False
        return 0
def niceify_seconds(s):
    """Format a duration in seconds as a short human-readable string.

    The value is rounded to a whole number of the largest fitting unit.

    :param s: duration in seconds (int or float)
    :returns: ``"<n>h"`` for an hour or more, ``"<n>m"`` for a minute or more,
        ``"<n>s"`` otherwise
    """
    if s >= 3600:
        return f"{s / 60 / 60:,.0f}h"
    if s >= 60:
        return f"{s / 60:,.0f}m"
    # .0f keeps float inputs from printing a long tail of decimals
    return f"{s:,.0f}s"
def handle_keyboard_interrupt(fun):
    """Decorator that turns a ``KeyboardInterrupt`` into a clean exit.

    If the wrapped function raises ``KeyboardInterrupt``, a short message is
    printed and the process exits with status 1 instead of showing a traceback.

    :param fun: the function to wrap
    :returns: wrapper that returns whatever ``fun`` returns
    """

    # wraps keeps the wrapped function's __name__/__doc__ on the wrapper
    @functools.wraps(fun)
    def _handle_keyboard_interrupt(*args, **kwargs):
        try:
            return fun(*args, **kwargs)
        except KeyboardInterrupt:
            print("KeyboardInterrupt!")
            sys.exit(1)

    return _handle_keyboard_interrupt
def _upload_pending_batch(session):
    """Zip everything staged in SYMDIR into symbols.zip and upload it."""
    build_zip_file("symbols.zip", SYMDIR)
    symbols_zip_size = get_size("symbols.zip")
    print(f">>> symbols.zip size: {symbols_zip_size:,}")
    print(">>> uploading ...")
    upload_zip_file(session, "symbols.zip")


@handle_keyboard_interrupt
def main():
    """Download, compress and upload every file listed in uncompressed_keys.txt.

    Each non-blank, non-comment line of ``uncompressed_keys.txt`` is a file
    key. Keys whose compressed counterpart is already in the bucket are
    skipped. The others are downloaded into DATADIR, compressed into SYMDIR,
    and uploaded in batches once the staged files reach SYMBOLS_ZIP_GOOD_SIZE.

    The number of leading lines to skip comes from ``sys.argv[1]`` if given,
    otherwise from ``last_upload.txt`` if it exists. That file is rewritten
    with the current line index after every batch upload so an interrupted run
    can resume.
    """
    skip = 0
    if len(sys.argv) > 1:
        skip = int(sys.argv[1])
    elif os.path.exists("last_upload.txt"):
        with open("last_upload.txt", "r") as fp:
            skip = int(fp.read().strip())

    with open("uncompressed_keys.txt", "r") as fp:
        lines = fp.readlines()

    # Remove things from previous run of the script if they're there
    if os.path.exists(SYMDIR):
        shutil.rmtree(SYMDIR)
    pathlib.Path("symbols.zip").unlink(missing_ok=True)

    print(">>> Number of files: %s" % len(lines))

    total_size = 0
    total_files = 0
    # Files compressed into SYMDIR since the last upload
    pending_files = 0
    start_time = time.time()
    session = retry_session()

    for i, line in enumerate(lines):
        if i < skip:
            continue

        fn = line.strip()
        if not fn or line.startswith("#"):
            continue

        print(">>> Working on %s (%d/%d)" % (fn, i, len(lines)))
        print(">>> %s" % datetime.datetime.now())

        # Check if the file is there--if it is, skip it
        if head_file(session, fn):
            print(">>> already there--skipping")
            continue

        cache_fn = os.path.join(DATADIR, fn)
        symbols_tmp_fn = os.path.join(SYMDIR, fn[:-1] + "_")

        # If we don't have the file on disk, download it
        if not download_file(session, fn, cache_fn):
            continue

        # Compress file to symbols tmp dir
        compress_file(cache_fn, symbols_tmp_fn)

        total_size += get_size(symbols_tmp_fn)
        total_files += 1
        pending_files += 1

        # Add file sizes up and if that's less than the good size, we can just
        # blithely skip along without building and computing a zip file size
        if total_size < SYMBOLS_ZIP_GOOD_SIZE:
            print(f">>> accumulated size: {total_size:,}")
            continue

        # The staged files are big enough, so zip and upload them
        _upload_pending_batch(session)

        # Save the line number of the last upload so we can re-start there if the script
        # dies
        with open("last_upload.txt", "w") as fp:
            fp.write("%s" % i)

        total_time = time.time() - start_time
        est_left = niceify_seconds((total_time / total_files) * (len(lines) - i))
        print(f"Total: {total_time:,.0f}s Files: {i}/{len(lines)} Est left: {est_left}")
        print("")

        # Clean up this pass
        total_size = 0
        pending_files = 0
        shutil.rmtree(SYMDIR)
        pathlib.Path("symbols.zip").unlink(missing_ok=True)

    # Upload whatever is left over from the last, partial batch. Skip this when
    # nothing is staged so we never build and upload an empty zip.
    if pending_files:
        _upload_pending_batch(session)
    else:
        print(">>> nothing left to upload")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment