# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""To run:
1. ``mkvirtualenv --python=/usr/bin/python3 crashids``
2. ``pip install -r requirements.txt``
3. ``python get_ids_1337688.py <ACCESS_KEY> <SECRET_ACCESS_KEY>``
This gets crash ids before and after a certain build, pulls down their
``upload_file_minidump`` files from s3, then compares all the files and prints
out rough analysis.
"""
import logging
import os
import pathlib
import sys

import boto3
from botocore.client import Config
import requests

logging.basicConfig(level=logging.DEBUG)

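# S3 bucket and region that hold production crash data for crash-stats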
BUCKET_NAME = 'org.mozilla.crash-stats.production.crashes'
REGION = 'us-west-2'
RESULTS = 100
SUPERSEARCH_URL = "https://crash-stats.mozilla.org/api/SuperSearch/"
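# Format string for printing one line per SuperSearch hit; each hit is expected
# to have at least date, product, version, and uuid keys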
HIT_TMPL = '%(date)-32s %(product)-10s %(version)-10s %(uuid)-20s'
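# S3 key template for a crash's upload_file_minidump dump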
FILENAME_TMPL = 'v1/dump/%(crashid)s'


def build_s3_client(access_key, secret_access_key):
    session = boto3.session.Session(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_access_key,
    )
    client = session.client(
        service_name='s3',
        region_name=REGION,
        config=Config(s3={'addressing_style': 'path'})
    )
    return client


def get_by_query(query):
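    # "query" is an iterable of already-encoded field=value terms; e.g.
    # ('product=Firefox', '_results_number=100') turns into
    # .../SuperSearch/?product=Firefox&_results_number=100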
    url = SUPERSEARCH_URL + '?' + '&'.join(query)
    return requests.get(url)


def print_results(results):
"""Print the results of the SuperSearch query
This helps to make sure the query is correct and we're getting back
appropriate results.
"""
print(HIT_TMPL % {'date': 'date', 'product': 'product', 'version': 'version', 'uuid': 'uuid'})
for res in results:
print(HIT_TMPL % res)
def fetch_and_save(s3_client, dir_, hits):
"""Fetch the dumps for the specified crashes and save them in specified
directory
"""
if not os.path.exists(dir_):
os.makedirs(dir_)
print(dir_)
print_results(hits)
for hit in hits:
crashid = hit['uuid']
fn = os.path.join(dir_, crashid)
if os.path.exists(fn):
print('already exists %s' % fn)
continue
print('fetching %s upload_file_minidump...' % crashid)
with open(fn, 'wb') as fp:
resp = s3_client.get_object(
Bucket=BUCKET_NAME,
Key=FILENAME_TMPL % {'crashid': crashid}
)
fp.write(resp['Body'].read())
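# Note: the median and "95%" figures below are simply the sizes at the 50% and
# 95% positions of the size-sorted file list.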
def analyze(dir_):
"""Analyze the files in the directory and print stats to stdout"""
path = pathlib.Path(dir_)
files = [(f, f.stat().st_size) for f in path.glob('**/*') if f.is_file()]
files.sort(key=lambda part: part[1])
print(dir_)
print(' Number of files: %10d' % len(files))
print(' Average size: %10d' % (sum([f[1] for f in files]) / len(files)))
print(' Median size: %10d' % files[int(len(files) / 2)][1])
print(' 95%% size: %10d' % files[int(len(files) * 0.95)][1])
print(' Max size: %10d' % files[-1][1])
def main(args):
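    # args is (ACCESS_KEY, SECRET_ACCESS_KEY) from the command line; see the
    # module docstring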
    access_key, secret_access_key = args

    # Build an S3 client which we'll use to pull down dump files
    s3_client = build_s3_client(access_key, secret_access_key)

    # Get all crash ids that:
    #
    # - product: Firefox
    # - channel: nightly
    # - OS: windows
    # - build id < 20170209030214 vs. build id >= 20170209030214
    # - dates from 2/1 to 2/19
    #
    # SuperSearch comparison operators like "<" and ">=" are prefixed to the
    # value in each term.
    #
    # We get some crashes that match our criteria per day for a 9-day range.
    # This is the "before the change" set.
    for day in range(1, 10):
        date = '2017-02-%02d' % day
        before_query = (
            'product=Firefox',
            'release_channel=nightly',
            'platform=Windows NT',
            'build_id=<20170209030214',
            'date=>' + date,
            'date=<2017-02-%02d' % (day + 1),
            '_results_number=%d' % RESULTS,
        )
        resp = get_by_query(before_query)
        hits = resp.json()['hits']
        fetch_and_save(s3_client, os.path.join('.', 'before', date), hits)

    # We get some crashes that match our criteria per day for a 9-day range.
    # This is the "after the change" set.
    for day in range(10, 19):
        date = '2017-02-%02d' % day
        after_query = (
            'product=Firefox',
            'release_channel=nightly',
            'platform=Windows NT',
            'build_id=>=20170209030214',
            'date=>' + date,
            'date=<2017-02-%02d' % (day + 1),
            '_results_number=%d' % RESULTS,
        )
        resp = get_by_query(after_query)
        hits = resp.json()['hits']
        fetch_and_save(s3_client, os.path.join('.', 'after', date), hits)

    # Analyze the before and after sets--these print to stdout
    analyze(os.path.join('.', 'before'))
    analyze(os.path.join('.', 'after'))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))