defaultsearch-nightly
# coding: utf-8

# In[12]:

### Bug 1255458 - Validation of the Fennec nightly "core" ping submissions

# Validate "core" pings sent by Firefox for Android to make sure the data they contain makes sense.
# In[1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt

from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history

get_ipython().magic(u'pylab inline')
# In[2]:

submission_dates = ("20160329", "20160412")
core_pings = get_pings(sc,
                       app="Fennec",
                       channel="nightly",
                       doc_type="core",
                       source_version="2",
                       submission_date=submission_dates,
                       fraction=1.0)

# In[3]:

pings_count = core_pings.count()
pings_count
# ### How many different clients are we seeing?

# In[4]:

one_per_client = get_one_ping_per_client(core_pings)
num_clients = one_per_client.count()
num_clients
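# As a rough gauge of submission frequency over this two-week window, divide the two counts above:

# In[ ]:

pings_count / float(num_clients)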
# ### Are the pings respecting our desired schema?

# In[5]:
def core_ping_check(p):
    # A minimal schema to validate the required fields and their types.
    req_fields = {
        "v": [int],
        "clientId": [unicode],
        "seq": [int],
        "locale": [unicode],
        "os": [unicode],
        "osversion": [unicode],
        "device": [unicode],
        "arch": [unicode],
        "profileDate": [int, long],
        "defaultSearch": [unicode, type(None)]
    }

    opt_fields = {
        "experiments": list,
    }

    # Does the ping contain all the required top-level fields?
    for k, types in req_fields.iteritems():
        if k not in p:
            return ("missing key: " + k, p)
        if type(p[k]) not in types:
            return ("wrong type: " + k, p)

    # Does it contain any optional field? If so, make sure it has the correct type.
    for k, v in opt_fields.iteritems():
        if k in p:
            if type(p[k]) != v:
                return ("wrong type: " + k, p)

    # Perform some additional sanity checks.
    if p["v"] < 1:
        return ("check failed: ping.v < 1", p)
    if p["seq"] < 0:
        return ("check failed: ping.seq < 0", p)
    if p["profileDate"] < 0:
        return ("check failed: ping.profileDate < 0", p)
    if p["profileDate"] < 10957: # profileDates before the year 2000?
        return ("check failed: ping.profileDate < 10957", p)
    if p["profileDate"] > 17167: # profileDates after the year 2016?
        return ("check failed: ping.profileDate > 17167", p)
    # defaultSearch may legitimately be null; only check its length when present.
    if p["defaultSearch"] is not None:
        if len(p["defaultSearch"]) < 1:
            return ("check failed: ping.defaultSearch length < 1", p)
        if len(p["defaultSearch"]) > 20:
            return ("check failed: ping.defaultSearch length > 20", p)

    # Validate the clientId.
    try:
        UUID(p["clientId"], version=4)
    except ValueError:
        return ("check failed: clientId is UUID", p)

    return ("", p)
checked_pings = core_pings.map(core_ping_check)
result_counts = checked_pings.countByKey()
result_counts
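# As a quick aside, the profileDate cutoffs used in core_ping_check convert back to calendar dates as follows (10957 and 17167 are days since the Unix epoch):

# In[ ]:

epoch = dt.date(1970, 1, 1)
print(epoch + dt.timedelta(days=10957))  # 2000-01-01
print(epoch + dt.timedelta(days=17167))  # 2017-01-01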
# So we have broken pings. Let's check examples for the types of failures:

# In[6]:

grouped_checked_pings = (checked_pings
                         .filter(lambda t: t[0] != '')
                         .groupByKey()
                         .collectAsMap())
# In[7]:

def sanitized_first(t):
    # Take one example ping per failure type and redact its identifying fields.
    p = list(t[1])[0]
    p['clientId'] = '...'
    p['meta']['clientId'] = '...'
    p['meta']['documentId'] = '...'
    return (t[0], p)

map(sanitized_first, grouped_checked_pings.iteritems())
# Let's see which search engines are submitted.

# In[8]:

def get_engine(p):
    if "defaultSearch" not in p:
        return None
    return p["defaultSearch"]

engines = core_pings.map(get_engine)
engine_counts = engines.countByValue()
engine_counts
# Let's get percentages for that.

# In[9]:

[(k, round((float(v) / pings_count) * 100, 3)) for k, v in engine_counts.iteritems()]
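# The same breakdown works as a quick bar chart. This is a sketch using the pandas/matplotlib imports above; the keys are stringified because countByValue() can return None as a key.

# In[ ]:

engine_series = pd.Series(dict((unicode(k), v) for k, v in engine_counts.iteritems()))
(engine_series * 100.0 / pings_count).plot(kind="bar")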
# ### Breakdown on defaultSearch == null

# Find the affected clients, i.e. those that submitted at least one ping with a null defaultSearch.

# In[22]:

clients = (core_pings
           .filter(lambda p: p.get("defaultSearch", None) is None)
           .map(lambda p: p["clientId"])
           .distinct())
clients = frozenset(clients.collect())
len(clients)
# Group their pings by client and order each ping history by sequence number.

# In[24]:

def get_ping_info(p):
    return {
        "clientId": p["clientId"],
        "seq": p["seq"],
        "defaultSearch": p.get("defaultSearch")
    }

def dedupe_and_sort(group):
    key, history = group
    seen = set()
    result = []
    for fragment in history:
        # Skip duplicate submissions of the same document.
        doc_id = fragment["meta"]["documentId"]
        if doc_id in seen:
            continue
        seen.add(doc_id)
        result.append(get_ping_info(fragment))
    result.sort(key=lambda p: p["seq"])
    return result

grouped = (core_pings
           .groupBy(lambda x: x["clientId"])
           .filter(lambda t: t[0] in clients)
           .map(dedupe_and_sort))
histories = grouped.collect()
len(histories)
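# To verify the deduplication and ordering, peek at the first few entries of one client's history (this assumes at least one affected client; the clientId is redacted and the slice length is arbitrary):

# In[ ]:

[dict(p, clientId='...') for p in histories[0][:5]]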
# Now, let's build a frequency map of *position in the ping sequence* versus *defaultSearch is null*.
# We should normalize this by the actual ping counts at each position to avoid bias.

# In[41]:

non_null_counts = {}
counts = {}

for h in histories:
    for i, p in enumerate(h):
        counts[i] = counts.get(i, 0) + 1
        # Count pings that have a non-null defaultSearch at this position.
        hasDS = p.get("defaultSearch") is not None
        non_null_counts[i] = non_null_counts.get(i, 0) + (1 if hasDS else 0)
# In[45]:

count_series = pd.Series(counts)
non_null_series = pd.Series(non_null_counts)
# The number of pings with a null defaultSearch at each sequence position:
null_by_position = count_series - non_null_series
null_by_position.describe()
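# One way to apply the normalization mentioned above is to express the null submissions as a fraction of all pings at each sequence position:

# In[ ]:

null_fraction = null_by_position / count_series.astype(float)
null_fraction.describe()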
# So all the affected clients have only `null` values for `defaultSearch`.