Last active
March 14, 2016 15:23
-
-
Save Uberi/2e97bd029eb5783967b7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/bucket-whitelist.json). Assuming all histograms are acceptable.\n" | |
] | |
} | |
], | |
"source": [ | |
"from datetime import datetime\n", | |
"\n", | |
"import psycopg2\n", | |
"import numpy as np\n", | |
"\n", | |
"from moztelemetry.spark import get_pings, get_pings_properties" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"SUBMISSION_DATE_RANGE = (datetime.utcnow().strftime(\"%Y%m%d\"),) * 2\n", | |
"FRACTION = 0.1\n", | |
"\n", | |
"COMPARABLE_DIMENSIONS = [\n", | |
" \"environment/build/version\",\n", | |
" \"environment/build/buildId\",\n", | |
" \"application/channel\",\n", | |
" \"application/name\",\n", | |
" \"environment/system/os/name\",\n", | |
" \"environment/system/os/version\",\n", | |
" \"environment/build/architecture\",\n", | |
" \"meta/geoCountry\",\n", | |
" \"environment/addons/activeExperiment/id\",\n", | |
" \"environment/addons/activeExperiment/branch\",\n", | |
" \"environment/settings/e10sEnabled\",\n", | |
"]\n", | |
"DIMENSION_NAMES = [\n", | |
" \"build_version\",\n", | |
" \"build_id\",\n", | |
" \"channel\",\n", | |
" \"application\",\n", | |
" \"os_name\",\n", | |
" \"os_version\",\n", | |
" \"architecture\",\n", | |
" \"country\",\n", | |
" \"experiment_id\",\n", | |
" \"experiment_branch\",\n", | |
" \"e10s_enabled\",\n", | |
"]\n", | |
"assert len(COMPARABLE_DIMENSIONS) == len(DIMENSION_NAMES)\n", | |
"\n", | |
"def compare_crashes(pings, comparable_dimensions):\n", | |
" \"\"\"Returns a PairRDD where keys are user configurations and values are Numpy arrays of the form [usage hours, main process crashes, content process crashes, plugin crashes]\"\"\"\n", | |
" ping_properties = get_pings_properties(pings, comparable_dimensions + [\n", | |
" \"payload/info/subsessionLength\",\n", | |
" \"meta/submissionDate\",\n", | |
" \"meta/reason\",\n", | |
" \"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/content\",\n", | |
" \"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin\",\n", | |
" \"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/gmplugin\",\n", | |
" ])\n", | |
" return ping_properties.map(lambda p: (\n", | |
" # the keys we want to filter based on\n", | |
" (p[\"meta/submissionDate\"],) + tuple(p[key] for key in comparable_dimensions),\n", | |
" np.array([\n", | |
" (p[\"payload/info/subsessionLength\"] or 0) / 3600.0,\n", | |
" int(p[\"meta/reason\"] == \"aborted-session\"), # main process crashes\n", | |
" p[\"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/content\"] or 0, # content process crashes\n", | |
" (p[\"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin\"] or 0) +\n", | |
" (p[\"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/gmplugin\"] or 0) # plugin crashes\n", | |
" ])\n", | |
" )).reduceByKey(lambda a, b: a + b)\n", | |
"\n", | |
"def retrieve_crash_data(sc, submission_date_range, comparable_dimensions, fraction = 0.1):\n", | |
" # get the raw data\n", | |
" normal_pings = get_pings(\n", | |
" sc,\n", | |
" submission_date=submission_date_range,\n", | |
" fraction=fraction\n", | |
" )\n", | |
" crash_pings = get_pings(\n", | |
" sc, doc_type=\"main\",\n", | |
" submission_date=submission_date_range,\n", | |
" fraction=fraction\n", | |
" ).filter(lambda p: p.get(\"meta\", {}).get(\"reason\") == \"aborted-session\")\n", | |
"\n", | |
" return normal_pings.union(crash_pings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"conn = psycopg2.connect(database=\"aggregates\", user=\"postgres\")\n", | |
"cur = conn.cursor()\n", | |
"\n", | |
"cur.execute(\"\"\"\n", | |
"CREATE TABLE IF NOT EXISTS aggregates (\n", | |
" id serial PRIMARY KEY,\n", | |
" submission_date date,\n", | |
" build_version varchar,\n", | |
" build_id varchar,\n", | |
" channel varchar,\n", | |
" application varchar,\n", | |
" os_name varchar,\n", | |
" os_version varchar,\n", | |
" architecture varchar,\n", | |
" country varchar,\n", | |
" experiment_id varchar,\n", | |
" experiment_branch varchar,\n", | |
" e10s_enabled varchar,\n", | |
" usage_hours real,\n", | |
" main_crashes real,\n", | |
" content_crashes real,\n", | |
" plugin_crashes real\n", | |
");\n", | |
"\"\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# remove previous data for the selected days, if available\n", | |
"cur.execute(\n", | |
" \"\"\"DELETE FROM aggregates WHERE submission_date >= %s and submission_date <= %s\"\"\".format(\", \".join(DIMENSION_NAMES)),\n", | |
" (datetime.strptime(SUBMISSION_DATE_RANGE[0], \"%Y%m%d\").date(), datetime.strptime(SUBMISSION_DATE_RANGE[1], \"%Y%m%d\").date())\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"pings = retrieve_crash_data(sc, SUBMISSION_DATE_RANGE, COMPARABLE_DIMENSIONS, FRACTION)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"result = compare_crashes(pings, COMPARABLE_DIMENSIONS)\n", | |
"for dimension_values, crash_data in result.toLocalIterator():\n", | |
" submission_date, dimension_values = dimension_values[0], dimension_values[1:]\n", | |
" submission_date = datetime.strptime(submission_date, \"%Y%m%d\")\n", | |
" usage_hours, main_crashes, content_crashes, plugin_crashes = crash_data\n", | |
" cur.execute(\n", | |
" \"\"\"INSERT INTO aggregates(submission_date, {}, usage_hours, main_crashes, content_crashes, plugin_crashes) VALUES (%s, {}%s, %s, %s, %s)\"\"\".format(\n", | |
" \", \".join(DIMENSION_NAMES), \"%s, \" * len(DIMENSION_NAMES)\n", | |
" ),\n", | |
" (submission_date,) + dimension_values + (usage_hours, main_crashes, content_crashes, plugin_crashes)\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"conn.commit()\n", | |
"cur.close()\n", | |
"conn.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment