Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active December 19, 2016 16:22
Show Gist options
  • Save georgf/c7f73c514370f96e477fc018070f66ac to your computer and use it in GitHub Desktop.
Save georgf/c7f73c514370f96e477fc018070f66ac to your computer and use it in GitHub Desktop.
Validate event telemetry on Nightly
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ## Bug 1303044 - Validate engagement measurements on Beta
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# We get all the pings on Nightly, after bug 1316281 landed.
# In[2]:
def dedupe(pings):
    """Return the pings with at most one entry per ``meta/documentId``.

    `pings` must offer ``map``/``reduceByKey``/``values`` (an RDD of mapped
    ping dicts in this notebook).
    """
    keyed_by_document = pings.map(lambda ping: (ping["meta/documentId"], ping))
    # The reducer always keeps its first argument, so exactly one ping
    # survives for every document id.
    one_per_document = keyed_by_document.reduceByKey(lambda kept, _other: kept)
    return one_per_document.values()
# Ping properties to extract. Each entry is a "/"-separated path into the raw
# ping and doubles as the key under which the value is stored.
_MAPPED_FIELDS = (
    'meta/clientId',
    'meta/documentId',
    'meta/submissionDate',
    'environment/system/os',
    'environment/system/version',
    'payload/info/reason',
    # Events
    'payload/processes/parent/events',
    # Handle the keyed histogram SEARCH_COUNTS, as get_pings_properties is
    # not doing it right.
    'payload/keyedHistograms/SEARCH_COUNTS',
)


def _lookup(node, path):
    """Walk `path` (a sequence of keys) through nested dicts.

    Returns None as soon as an intermediate value is missing or is not a
    dict, instead of raising.
    """
    for key in path:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def map_fields(p):
    """Flatten a raw ping into a dict keyed by "/"-joined property paths.

    We need to do manual mapping instead of get_pings_properties() due to
    bug 1318326.

    Every path in `_MAPPED_FIELDS` is present in the result; a property that
    is absent, or whose parent section is null / not a dict, maps to None.
    """
    return {field: _lookup(p, field.split('/')) for field in _MAPPED_FIELDS}
def filter(pings):
    """Map each raw ping to the fields of interest, then de-duplicate.

    NOTE: this shadows the builtin ``filter`` for the rest of the notebook;
    the name is kept because later cells call it.
    """
    return dedupe(pings.map(map_fields))
# Load the Firefox "main" pings from the nightly channel for the given
# submission-date and build-id windows, then reduce them to the mapped,
# de-duplicated subset via filter().
# NOTE(review): `sc` is not defined in this file - presumably the SparkContext
# supplied by the notebook environment; confirm.
all_pings = filter(get_pings(sc,
app="Firefox",
channel="nightly",
doc_type="main",
submission_date=("20161203", "20161220"),
build_id=("20161203000000", "20161220999999"), # Post bug 1316281.
fraction=1.0))
# In[3]:
# Number of distinct client ids among the selected pings.
total_clients = all_pings.map(lambda p: p["meta/clientId"]).distinct().count()
# Bare expression: shows the value as the cell output when run in a notebook.
total_clients
# In[4]:
# Number of pings in the selection; used further down as the denominator
# when reporting ratios.
total_pings = all_pings.count()
total_pings
# ### Type-check the payloads and events.
# Make sure each ping has an events section and the contained events have the right format.
# In[5]:
# Type aliases so the checks below work on both Python 2 and Python 3.
try:
    _EVENT_STRING_TYPES = (basestring,)
    _EVENT_INT_TYPES = (int, long)
except NameError:  # Python 3 has neither basestring nor long.
    _EVENT_STRING_TYPES = (str,)
    _EVENT_INT_TYPES = (int,)

# Values accepted in e[3] of a search event.
_SEARCH_EVENT_SOURCES = (
    "about_home", "about_newtab", "contextmenu", "oneoff",
    "suggestion", "alias", "enter", "searchbar", "urlbar",
)


def events_check(p):
    """Validate the events section of a mapped ping.

    Each event must be a list of 4 to 6 entries laid out as checked below:
    e[0] an integer, e[1]..e[3] strings, e[4] (optional) a string or None,
    e[5] (optional) a dict or None. Only events with e[1] == "navigation",
    e[2] == "search" and e[3] in `_SEARCH_EVENT_SOURCES` are expected.

    Returns a ``(message, p)`` tuple. `message` names the first problem
    found and is the empty string when the ping passes every check.
    """
    events = p["payload/processes/parent/events"]
    if events is None:
        return ("events section is None", p)
    if events == {}:
        return ("events section is {}", p)
    if events == []:
        return ("events section is []", p)
    if not isinstance(events, list):
        return ("events section is not a list", p)

    for e in events:
        if not isinstance(e, list):
            return ("event entry is not a list", p)

        # Length check.
        if len(e) < 4:
            return ("event entry count < 4", p)
        if len(e) > 6:
            # Previously reported with the "< 4" message, which lumped
            # over-long events together with truncated ones.
            return ("event entry count > 6", p)

        # Type checks.
        if not isinstance(e[0], _EVENT_INT_TYPES):
            return ("event entry 0 is not int or long", p)
        # Entries 1 through 3 are all mandatory strings.
        for i in range(1, 4):
            if not isinstance(e[i], _EVENT_STRING_TYPES):
                return ("event entry "+str(i)+" is not basestring", p)
        if len(e) >= 5 and e[4] is not None and not isinstance(e[4], _EVENT_STRING_TYPES):
            return ("event entry 4 is not basestring or None", p)
        if len(e) == 6 and e[5] is not None and not isinstance(e[5], dict):
            return ("event entry 5 is not dict or None", p)

        # Content check: only search events from a known source are expected.
        if e[1] != "navigation" or e[2] != "search" or e[3] not in _SEARCH_EVENT_SOURCES:
            return ("unexpected event", p)

    return ("", p)
# Pair every ping with its validation message, then count pings per message.
checked_pings = all_pings.map(events_check)
result_counts = checked_pings.countByKey()
# Bare expression: shows the counts as the cell output when run in a notebook.
result_counts
# So far, so reasonable. The low ratio of pings with a missing events section can probably be attributed to custom builds.
#
# Note that empty events arrays show up as `{}` instead of as `[]` (see [bug 1323964](https://bugzilla.mozilla.org/show_bug.cgi?id=1323964)), so we can't strictly typecheck that at the moment.
# ### Let's cross-check events with search counts
# In[6]:
# SEARCH_COUNTS and the search events name some UI sources differently;
# this maps the histogram spelling to the event spelling.
_HIST_TO_EVENT_SOURCE = {
    'abouthome': 'about_home',
    'newtab': 'about_newtab',
}


def check_counts(p):
    """Cross-check the searches in SEARCH_COUNTS against the search events.

    Both probes are reduced to a per-source count, e.g. ``{'urlbar': 2}``:
    the histogram by summing the ``sum`` of every key that ends in the same
    ``.source`` suffix, the events by counting entries per ``e[3]``.

    Returns a ``(message, p)`` tuple. `message` is ``"ok"`` when every
    histogram source has a matching event count and the totals agree;
    otherwise it describes the first mismatch found.

    Event entries that are not lists or have fewer than four items are
    skipped rather than raising; their structure is validated separately
    by ``events_check``.
    """
    search_counts = p.get("payload/keyedHistograms/SEARCH_COUNTS", {}) or {}
    events = p.get("payload/processes/parent/events", {}) or {}

    # Count the searches in SEARCH_COUNTS.
    cnt_hist = {}
    for k in search_counts:
        ui_source = k.split('.')[-1]
        cnt_hist[ui_source] = cnt_hist.get(ui_source, 0) + search_counts[k].get('sum', 0)

    # Count the searches in the search events.
    cnt_events = {}
    for e in events:
        if not isinstance(e, (list, tuple)) or len(e) < 4:
            # Malformed entry: there is no e[3] to attribute it to.
            continue
        ui_source = e[3]
        cnt_events[ui_source] = cnt_events.get(ui_source, 0) + 1

    # See if each entry matches up.
    for k in cnt_hist:
        e_key = _HIST_TO_EVENT_SOURCE.get(k, k)
        if e_key not in cnt_events:
            return (u"{} is not being recorded by events.".format(e_key), p)
        if cnt_hist[k] != cnt_events[e_key]:
            return ("{} doesn't match.".format(k), p)

    # Make sure the sum matches up as well. This also catches event sources
    # that never show up in the histogram.
    if sum(cnt_hist.values()) != sum(cnt_events.values()):
        return ("The sums don't match!", p)
    return ("ok", p)
# In[7]:
# Tally the cross-check verdicts, then list them from most to least frequent
# as a share of all pings.
mismatches_by_key = all_pings.map(check_counts).countByKey()
for verdict, ping_count in sorted(mismatches_by_key.items(),
                                  key=lambda pair: pair[1],
                                  reverse=True):
    print(u"{} - Ratio {:.4f}".format(verdict, ping_count / float(total_pings)))
# In[13]:
# Pull the events sections of up to 30 pings that recorded at least one event.
event_blobs = (
    all_pings
    .map(lambda ping: ping.get("payload/processes/parent/events", {}) or {})
    .filter(lambda blob: len(blob) > 0)
    .take(30)
)
# In[14]:
# Dump the sampled events, one per line, with an empty line before each ping.
for events in event_blobs:
    print("")
    for e in events:
        print(e)
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment