Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active April 12, 2016 11:59
Show Gist options
  • Save georgf/7f3925fc68e65e2402c80eb7e5db78c8 to your computer and use it in GitHub Desktop.
Save georgf/7f3925fc68e65e2402c80eb7e5db78c8 to your computer and use it in GitHub Desktop.
defaultsearch-nightly
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[12]:
### Bug 1255458 - Validation of the Fennec release "core" ping submissions
# Validate "core" pings sent by Firefox for Android to make sure the data they contain makes sense.
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# In[2]:
# Pull two weeks of Fennec nightly "core" pings (format v2) via moztelemetry;
# fraction=1.0 means no sampling of the submissions.
submission_dates = ("20160329", "20160412")
core_pings = get_pings(sc,
app="Fennec",
channel="nightly",
doc_type="core",
source_version="2",
submission_date=submission_dates,
fraction=1.0)
# In[3]:
# Total number of ping submissions in the window (cell output).
pings_count = core_pings.count()
pings_count
# ### How many different clients are we seeing?
# In[4]:
# Reduce to one ping per client to count how many distinct clients submitted.
one_per_client = get_one_ping_per_client(core_pings)
num_clients = one_per_client.count()
num_clients
# ### Are the pings respecting our desired schema?
# In[5]:
def core_ping_check(p):
    """Validate a single "core" ping against the expected schema.

    Returns a (failure_reason, ping) pair; failure_reason is the empty
    string when the ping passes every check, so the results can be fed
    directly into countByKey()/groupByKey().
    """
    # Resolve the text/integer types once so the check works on both
    # Python 2 (unicode/long) and Python 3 (str/int).
    try:
        text_type = unicode
        int_types = [int, long]
    except NameError:  # Python 3
        text_type = str
        int_types = [int]
    # That's a sort-of schema to validate the required fields and their types.
    req_fields = {
        "v": [int],
        "clientId": [text_type],
        "seq": [int],
        "locale": [text_type],
        "os": [text_type],
        "osversion": [text_type],
        "device": [text_type],
        "arch": [text_type],
        "profileDate": int_types,
        # defaultSearch may legitimately be null.  The original listed the
        # *value* None here instead of type(None), so every null value was
        # misreported as "wrong type".
        "defaultSearch": [text_type, type(None)],
    }
    # Optional fields are only type-checked when present.
    opt_fields = {
        "experiments": list,
    }
    # Does the ping contain all the required top-level fields?
    for k, types in req_fields.items():
        if k not in p:
            return ("missing key: " + k, p)
        if type(p[k]) not in types:
            return ("wrong type: " + k, p)
    # Does it contain any optional field? If so, make sure it has the correct type.
    for k, v in opt_fields.items():
        if k in p and type(p[k]) != v:
            return ("wrong type: " + k, p)
    # Perform some additional sanity checks.
    if p["v"] < 1:
        return ("check failed: ping.v < 1", p)
    if p["seq"] < 0:
        return ("check failed: ping.seq < 0", p)
    if p["profileDate"] < 0:
        return ("check failed: ping.profileDate < 0", p)
    if p["profileDate"] < 10957: # profileDates before the year 2000?
        return ("check failed: ping.profileDate < 10957", p)
    if p["profileDate"] > 17167: # profileDates after the year 2016?
        return ("check failed: ping.profileDate > 17167", p)
    # defaultSearch may be None (see schema above): only length-check actual
    # strings.  The original crashed on None here, and returned a bare string
    # instead of a (reason, ping) tuple, which broke countByKey() downstream.
    if p["defaultSearch"] is not None:
        if len(p["defaultSearch"]) < 1:
            return ("check failed: ping.defaultSearch length < 1", p)
        if len(p["defaultSearch"]) > 20:
            return ("check failed: ping.defaultSearch length > 20", p)
    # Validate the clientId.
    try:
        UUID(p["clientId"], version=4)
    except ValueError:
        return ("check failed: clientId is UUID", p)
    return ("", p)
# Run the schema check over every ping and tally the failure reasons.
checked_pings = core_pings.map(core_ping_check)
result_counts = checked_pings.countByKey()
result_counts
# So we have broken pings. Let's check examples for the types of failures:
# In[6]:
# Keep only the failing pings and collect them grouped by failure reason.
grouped_checked_pings = checked_pings.filter(lambda t: t[0] != '') .groupByKey() .collectAsMap()
# In[7]:
def sanitized_first(t):
    """Pick one example ping for a failure reason, with identifiers scrubbed.

    *t* is a (failure_reason, pings) pair from groupByKey(); the first ping
    is mutated so its clientId/documentId are redacted before display.
    """
    reason, pings = t
    example = list(pings)[0]
    example['clientId'] = '...'
    example['meta']['clientId'] = '...'
    example['meta']['documentId'] = '...'
    return (reason, example)
# Show one sanitized example ping per failure type.
map(sanitized_first, grouped_checked_pings.iteritems())
# Lets see what search engines are submitted.
# In[8]:
def get_engine(p):
    """Return the ping's default search engine, or None when the field is absent."""
    return p.get("defaultSearch")
# Tally how often each default search engine value appears across all pings.
engines = core_pings.map(get_engine)
engine_counts = engines.countByValue()
engine_counts
# Lets get percentages for that.
# In[9]:
# Share of each engine as a percentage of all pings in the window.
[(k, round((float(v) / pings_count) * 100, 3)) for k, v in engine_counts.iteritems()]
# ### Breakdown on defaultSearch == null
# Find the affected clients.
# In[22]:
# Clients affected by the null-defaultSearch issue.
# BUG FIX: the markdown above asks for clients whose defaultSearch is null,
# but the original filter kept pings where it was NOT null ("!= None"),
# selecting exactly the wrong set.  Select the null ones instead.
clients = core_pings.filter(lambda p: p.get("defaultSearch", None) is None) .map(lambda p: p["clientId"]) .distinct()
clients = frozenset(clients.collect())
len(clients)
# Group their pings by client and order the ping history.
# In[24]:
def get_ping_info(p):
    """Project a full ping down to the fields used by the sequence analysis."""
    keep = ("clientId", "seq", "defaultSearch")
    return {field: p[field] for field in keep}
def dedupe_and_sort(group):
    """De-duplicate one client's ping history and order it by sequence number.

    *group* is a (clientId, pings) pair from groupBy(); duplicate submissions
    (same meta.documentId) are dropped, keeping the first occurrence, and the
    surviving pings are reduced to the fields used by the analysis below.
    """
    _, pings = group
    seen_docs = set()
    history = []
    for ping in pings:
        doc_id = ping["meta"]["documentId"]
        if doc_id in seen_docs:
            continue
        seen_docs.add(doc_id)
        history.append({
            "clientId": ping["clientId"],
            "seq": ping["seq"],
            "defaultSearch": ping["defaultSearch"],
        })
    history.sort(key=lambda entry: entry["seq"])
    return history
# Build a deduplicated, seq-ordered ping history for each affected client.
grouped = core_pings.groupBy(lambda x: x["clientId"]) .filter(lambda t: t[0] in clients) .map(dedupe_and_sort)
histories = grouped.collect()
len(histories)
# Now, lets build a frequency map of *number in ping sequence* versus *defaultSearch is null*.
# We should normalize this on the actual ping counts in the sequences to avoid biasing.
# In[41]:
# For each position i in a client's ordered history, count how many pings
# exist at that position (counts) and how many of them carry a non-null
# defaultSearch (null_counts -- despite the name it accumulates the NON-null
# ones; the null count is derived below as counts - null_counts).
null_counts = {}
counts = {}
for h in histories:
    for i, p in enumerate(h):
        counts[i] = counts.get(i, 0) + 1
        # BUG FIX: the original tested the literal string
        # ('"defaultSearch" != None'), which is always True, so every ping
        # that merely had the key counted as having an engine.  Test the
        # ping's actual value instead.
        hasDS = ("defaultSearch" in p) and (p["defaultSearch"] is not None)
        null_counts[i] = null_counts.get(i, 0) + (1 if hasDS else 0)
# In[45]:
count_series = pd.Series(counts)
null_series = pd.Series(null_counts)
# Per-position pings minus those the loop above counted as having a search
# engine -- presumably the per-position count of null defaultSearch values;
# NOTE(review): verify against the hasDS logic above.
normalized = count_series - null_series
normalized.describe()
# So all the affected clients have only `null` values for `defaultSearch`.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment