Last active
December 19, 2016 16:22
-
-
Save georgf/c7f73c514370f96e477fc018070f66ac to your computer and use it in GitHub Desktop.
Validate event telemetry on Nightly
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ## Bug 1303044 - Validate engagement measurements on Beta | |
# In[1]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
import datetime as dt | |
from uuid import UUID | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history | |
get_ipython().magic(u'pylab inline') | |
# We get all the pings on Nightly, after bug 1316281 landed. | |
# In[2]: | |
def dedupe(pings):
    """Keep a single ping per document id.

    `pings` is expected to be an RDD-like collection of mapped pings that
    each carry a "meta/documentId" entry (presumably a Spark RDD; confirm
    against the caller). When several pings share a document id, the
    reduce step keeps whichever one it is handed as its first argument.
    """
    keyed_by_document = pings.map(lambda ping: (ping["meta/documentId"], ping))
    one_per_document = keyed_by_document.reduceByKey(lambda kept, _duplicate: kept)
    return one_per_document.values()
def map_fields(p):
    """Flatten the interesting parts of a raw ping into a path-keyed dict.

    We need to do this mapping by hand instead of using
    get_pings_properties() due to bug 1318326. The keyed histogram
    SEARCH_COUNTS is also extracted manually, because
    get_pings_properties() does not handle it correctly.

    Every value is None when the corresponding path is absent from `p`.
    """
    def lookup(*path):
        # Walk the nested dicts; a missing intermediate level is treated
        # as an empty dict so that the final .get() yields None.
        node = p
        for step in path[:-1]:
            node = node.get(step, {})
        return node.get(path[-1])

    return {
        'meta/clientId': lookup('meta', 'clientId'),
        'meta/documentId': lookup('meta', 'documentId'),
        'meta/submissionDate': lookup('meta', 'submissionDate'),
        'environment/system/os': lookup('environment', 'system', 'os'),
        'environment/system/version': lookup('environment', 'system', 'version'),
        'payload/info/reason': lookup('payload', 'info', 'reason'),
        # Events recorded in the parent process.
        'payload/processes/parent/events':
            lookup('payload', 'processes', 'parent', 'events'),
        'payload/keyedHistograms/SEARCH_COUNTS':
            lookup('payload', 'keyedHistograms', 'SEARCH_COUNTS'),
    }
def filter(pings):
    """Map raw pings to the flattened field layout, then de-duplicate them.

    NOTE: this name shadows the `filter` builtin for the rest of the
    notebook; it is kept because the cells below call it by this name.
    """
    return dedupe(pings.map(map_fields))
# Fetch every (fraction=1.0) Firefox "main" ping on the nightly channel for
# the submission window below, then flatten and de-duplicate them.
all_pings = filter(
    get_pings(
        sc,
        app="Firefox",
        channel="nightly",
        doc_type="main",
        submission_date=("20161203", "20161220"),
        # Only builds from after bug 1316281 landed.
        build_id=("20161203000000", "20161220999999"),
        fraction=1.0,
    )
)
# In[3]: | |
# Number of distinct client ids seen across the de-duplicated pings.
total_clients = (
    all_pings
    .map(lambda ping: ping["meta/clientId"])
    .distinct()
    .count()
)
total_clients
# In[4]:
# Number of de-duplicated pings; used as the denominator for ratios below.
total_pings = all_pings.count()
total_pings
# ### Type-check the payloads and events. | |
# Make sure each ping has an events section and the contained events have the right format. | |
# In[5]: | |
# Type aliases so the checks run unchanged on Python 2 (which this notebook
# targets) and on Python 3, where `basestring` and `long` no longer exist.
try:
    _EVENT_STRING_TYPES = (basestring,)
    _EVENT_INT_TYPES = (int, long)
except NameError:
    _EVENT_STRING_TYPES = (str,)
    _EVENT_INT_TYPES = (int,)

# The only "object" values (entry 3) accepted for navigation/search events.
_SEARCH_EVENT_SOURCES = frozenset([
    "about_home", "about_newtab", "contextmenu", "oneoff",
    "suggestion", "alias", "enter", "searchbar", "urlbar",
])


def events_check(p):
    """Validate the events section of a mapped ping.

    Args:
        p: A mapped ping (dict) that has a "payload/processes/parent/events"
           key, as produced by map_fields().

    Returns:
        A `(problem, p)` tuple. `problem` is the empty string when every
        event passes, otherwise a short label describing the first problem
        found, suitable for use as a key with countByKey().

    Each event must be a list of 4 to 6 entries: an integer first entry,
    strings in entries 1-3, an optional string-or-None entry 4 and an
    optional dict-or-None entry 5. Only navigation/search events with a
    known source in entry 3 are expected.
    """
    events = p["payload/processes/parent/events"]
    if events is None:
        return ("events section is None", p)
    # Empty sections are reported separately so `{}` and `[]` can be told
    # apart in the tallies.
    if events == {}:
        return ("events section is {}", p)
    if events == []:
        return ("events section is []", p)
    if not isinstance(events, list):
        return ("events section is not a list", p)

    for e in events:
        if not isinstance(e, list):
            return ("event entry is not a list", p)
        # Length check.
        if len(e) < 4:
            return ("event entry count < 4", p)
        if len(e) > 6:
            # Previously mislabelled as "< 4", which hid over-long events
            # among the too-short ones in the tallies.
            return ("event entry count > 6", p)
        # Type checks.
        if not isinstance(e[0], _EVENT_INT_TYPES):
            return ("event entry 0 is not int or long", p)
        # Entries 1, 2 and 3 must all be strings. The original range
        # stopped at entry 2, so a non-string entry 3 was reported as
        # "unexpected event" instead of as a type error.
        for i in range(1, 4):
            if not isinstance(e[i], _EVENT_STRING_TYPES):
                return ("event entry " + str(i) + " is not basestring", p)
        if (len(e) >= 5 and e[4] is not None
                and not isinstance(e[4], _EVENT_STRING_TYPES)):
            return ("event entry 4 is not basestring or None", p)
        if len(e) == 6 and e[5] is not None and not isinstance(e[5], dict):
            return ("event entry 5 is not dict or None", p)
        # Content check: only search events from known sources are expected.
        if (e[1] != "navigation" or e[2] != "search"
                or e[3] not in _SEARCH_EVENT_SOURCES):
            return ("unexpected event", p)
    return ("", p)
# Tag every ping with the outcome of events_check(), then tally how many
# pings fall under each outcome label.
checked_pings = all_pings.map(events_check)
result_counts = checked_pings.countByKey()
result_counts
# So far, so reasonable. The low ratio of pings with a missing events section can plausibly be blamed on custom builds or similar.
# | |
# Note that empty events arrays show up as `{}` instead of as `[]` (see [bug 1323964](https://bugzilla.mozilla.org/show_bug.cgi?id=1323964)), so we can't strictly typecheck that at the moment. | |
# ### Let's cross-check events with search counts | |
# In[6]: | |
def check_counts(p):
    """Cross-check the SEARCH_COUNTS histogram against the search events.

    Both probes are reduced to per-source counts, e.g. {'urlbar': 2} from
    SEARCH_COUNTS and {'urlbar': 2} from the events, and then compared.

    Args:
        p: A mapped ping (dict). A missing or empty
           "payload/keyedHistograms/SEARCH_COUNTS" or
           "payload/processes/parent/events" entry counts as no searches.

    Returns:
        A `(result, p)` tuple. `result` is "ok" when the counts agree,
        otherwise a message describing the first mismatch found; it is
        suitable for use as a key with countByKey().
    """
    search_counts = p.get("payload/keyedHistograms/SEARCH_COUNTS", {}) or {}
    events = p.get("payload/processes/parent/events", {}) or {}

    # Count the searches in SEARCH_COUNTS. The UI source is the last
    # dot-separated component of the histogram key.
    cnt_hist = {}
    for hist_key in search_counts:
        ui_source = hist_key.split('.')[-1]
        cnt_hist[ui_source] = (cnt_hist.get(ui_source, 0)
                               + search_counts[hist_key].get('sum', 0))

    # Count the searches in the search events; entry 3 is the UI source.
    cnt_events = {}
    for e in events:
        ui_source = e[3]
        cnt_events[ui_source] = cnt_events.get(ui_source, 0) + 1

    # The probes have different names for some keys. Account for that.
    event_name_for = {'abouthome': 'about_home', 'newtab': 'about_newtab'}

    # See if each entry matches up.
    for k in cnt_hist:
        e_key = event_name_for.get(k, k)
        if e_key not in cnt_events:
            return (u"{} is not being recorded by events.".format(e_key), p)
        if cnt_hist[k] != cnt_events[e_key]:
            return ("{} doesn't match.".format(k), p)

    # Make sure the sum matches up as well; this catches sources that only
    # show up in the events. (A per-ping debug print used to sit here; it
    # was dropped because it ran for every ping inside the Spark map.)
    sum_search_counts = sum(cnt_hist.values())
    sum_search_events = sum(cnt_events.values())
    if sum_search_counts != sum_search_events:
        return ("The sums don't match!", p)
    return ("ok", p)
# In[7]: | |
# Tally the cross-check outcomes and report each as a share of all pings,
# most frequent outcome first.
mismatches_by_key = all_pings.map(check_counts).countByKey()
for k, v in sorted(mismatches_by_key.iteritems(),
                   key=lambda outcome: outcome[1], reverse=True):
    print(u"{} - Ratio {:.4f}".format(k, v / float(total_pings)))
# In[13]: | |
# Pull the events sections of up to 30 pings that recorded any events.
event_blobs = (
    all_pings
    .map(lambda ping: ping.get("payload/processes/parent/events", {}) or {})
    .filter(lambda blob: len(blob) > 0)
    .take(30)
)
# In[14]:
# Dump the sampled events, with a blank line before each ping's events.
for events in event_blobs:
    print("")
    for e in events:
        print(e)
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment