Created
September 4, 2015 23:47
-
-
Save SamPenrose/584e9f9e65022b7fa847 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat_minor": 0, "cells": [{"source": "This notebook records an experiment with using Accumulators, rather than\nexplicitly returned values, to tally FHR statistics.\n\nThe notebook does NOT establish that the approach is a good idea, or even\nthat it performs better.\n\nThe code is WIP for a v2/v4 comparison focused on 41 beta.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "v2_path = 's3n://mozillametricsfhrsamples/beta/part-r-00001'\nv2_file = sc.sequenceFile(v2_path)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 12, "cell_type": "code", "source": "import ujson as json\nfrom numbers import Number\nV2_CRASH_KEYS = set(['main-crash', 'main-hang', 'plugin-crash', 'plugin-hang'])\nWINDOW_START = '2015-08-08'\nWINDOW_END = '2015-08-30'\nAN_HOUR = 3600.0\nCHANNEL = 'beta'\nVERSION = '41.0'\n\ndef make_walker():\n counters = {\n 'beta_41': sc.accumulator(0),\n 'crashes': sc.accumulator(0),\n 'yahoo': sc.accumulator(0),\n 'hours': sc.accumulator(0.0),\n 'default': sc.accumulator(0),\n 'parsing_errors': sc.accumulator(0)\n }\n\n def descend(d):\n '''\n Inspect a $data$days blob.\n '''\n searches = d.get('org.mozilla.searches.counts', {})\n for k in searches:\n if 'yahoo' in k.lower():\n value = searches[k]\n if isinstance(value, Number):\n counters['yahoo'].add(value)\n crash_dict = d.get('org.mozilla.crashes.crashes', {})\n crashes = sum([v for k, v in crash_dict.items() if k in V2_CRASH_KEYS])\n if crashes:\n counters['crashes'].add(crashes)\n if d.get('org.mozilla.appInfo.appinfo', {}).get('isDefaultBrowser', 0):\n counters['default'].add(1)\n seconds = sum(d.get('org.mozilla.appSessions.previous', {}).get(\n 'cleanTotalTime', [0]))\n if seconds > 0:\n hours = seconds/AN_HOUR\n counters['hours'].add(hours) \n \n def walk(tup):\n '''\n Process a single value from the v2 sequenceFile.\n '''\n try:\n cid, ping = tup\n d = json.loads(ping)\n # I suspect I should be checking each day-blob.\n info = d.get('geckoAppInfo')\n if info:\n c = info.get('updateChannel')\n v = info.get('version')\n if (c, v) != (CHANNEL, VERSION):\n return\n else:\n return\n days = d.get('data', {}).get('days', {})\n in_window = []\n for date, blob in days.items():\n if date < WINDOW_START:\n continue\n if date > WINDOW_END:\n continue\n in_window.append(blob)\n if not in_window:\n return\n counters['beta_41'].add(1)\n for day in in_window:\n descend(day)\n except Exception:\n counters['parsing_errors'].add(1)\n return walk, counters", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 13, "cell_type": "code", "source": "walk, counters = make_walker()", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 14, "cell_type": "code", "source": "run = v2_file.map(walk).collect()\nfor name, obj in counters.items():\n print name, obj.value", "outputs": [{"output_type": "stream", "name": "stdout", "text": "crashes 11052\ndefault 79038\nyahoo 9901\nhours 683967.951667\nparsing_errors 0\nbeta_41 12736\n"}], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment