defaultsearch-nightly
# coding: utf-8

# In[12]:

### Bug 1255458 - Validation of the Fennec nightly "core" ping submissions

# Validate "core" pings sent by Firefox for Android to make sure the data they contain makes sense.
# In[1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt

from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history

get_ipython().magic(u'pylab inline')
# In[2]:

submission_dates = ("20160329", "20160412")
core_pings = get_pings(sc,
                       app="Fennec",
                       channel="nightly",
                       doc_type="core",
                       source_version="2",
                       submission_date=submission_dates,
                       fraction=1.0)

# In[3]:

pings_count = core_pings.count()
pings_count
# ### How many different clients are we seeing?

# In[4]:

one_per_client = get_one_ping_per_client(core_pings)
num_clients = one_per_client.count()
num_clients
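# As a rough gauge of submission frequency over this two-week window, divide the two counts above:

# In[ ]:

pings_count / float(num_clients)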
# ### Are the pings respecting our desired schema?

# In[5]:
def core_ping_check(p):
    # A minimal schema to validate the required fields and their types.
    req_fields = {
        "v": [int],
        "clientId": [unicode],
        "seq": [int],
        "locale": [unicode],
        "os": [unicode],
        "osversion": [unicode],
        "device": [unicode],
        "arch": [unicode],
        "profileDate": [int, long],
        "defaultSearch": [unicode, type(None)]
    }

    opt_fields = {
        "experiments": list,
    }

    # Does the ping contain all the required top-level fields?
    for k, types in req_fields.iteritems():
        if k not in p:
            return ("missing key: " + k, p)
        if type(p[k]) not in types:
            return ("wrong type: " + k, p)

    # Does it contain any optional field? If so, make sure it has the correct type.
    for k, v in opt_fields.iteritems():
        if k in p:
            if type(p[k]) != v:
                return ("wrong type: " + k, p)

    # Perform some additional sanity checks.
    if p["v"] < 1:
        return ("check failed: ping.v < 1", p)
    if p["seq"] < 0:
        return ("check failed: ping.seq < 0", p)
    if p["profileDate"] < 0:
        return ("check failed: ping.profileDate < 0", p)
    if p["profileDate"] < 10957: # profileDates before the year 2000?
        return ("check failed: ping.profileDate < 10957", p)
    if p["profileDate"] > 17167: # profileDates after the year 2016?
        return ("check failed: ping.profileDate > 17167", p)
    # defaultSearch may legitimately be null; only check its length when present.
    if p["defaultSearch"] is not None:
        if len(p["defaultSearch"]) < 1:
            return ("check failed: ping.defaultSearch length < 1", p)
        if len(p["defaultSearch"]) > 20:
            return ("check failed: ping.defaultSearch length > 20", p)

    # Validate the clientId.
    try:
        UUID(p["clientId"], version=4)
    except ValueError:
        return ("check failed: clientId is UUID", p)

    return ("", p)
checked_pings = core_pings.map(core_ping_check)
result_counts = checked_pings.countByKey()
result_counts
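# As a quick aside, the profileDate cutoffs used in core_ping_check convert back to calendar dates as follows (10957 and 17167 are days since the Unix epoch):

# In[ ]:

epoch = dt.date(1970, 1, 1)
print(epoch + dt.timedelta(days=10957))  # 2000-01-01
print(epoch + dt.timedelta(days=17167))  # 2017-01-01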
# So we have broken pings. Let's check examples for the types of failures:

# In[6]:

grouped_checked_pings = (checked_pings
                         .filter(lambda t: t[0] != '')
                         .groupByKey()
                         .collectAsMap())
# In[7]:

def sanitized_first(t):
    # Take one example ping per failure type and redact its identifying fields.
    p = list(t[1])[0]
    p['clientId'] = '...'
    p['meta']['clientId'] = '...'
    p['meta']['documentId'] = '...'
    return (t[0], p)

map(sanitized_first, grouped_checked_pings.iteritems())
# Let's see which search engines are submitted.

# In[8]:

def get_engine(p):
    if "defaultSearch" not in p:
        return None
    return p["defaultSearch"]

engines = core_pings.map(get_engine)
engine_counts = engines.countByValue()
engine_counts
# Let's get percentages for that.

# In[9]:

[(k, round((float(v) / pings_count) * 100, 3)) for k, v in engine_counts.iteritems()]
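# The same breakdown works as a quick bar chart. This is a sketch using the pandas/matplotlib imports above; the keys are stringified because countByValue() can return None as a key.

# In[ ]:

engine_series = pd.Series(dict((unicode(k), v) for k, v in engine_counts.iteritems()))
(engine_series * 100.0 / pings_count).plot(kind="bar")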
# ### Breakdown on defaultSearch == null

# Find the affected clients, i.e. those that submitted at least one ping with a null defaultSearch.

# In[22]:

clients = (core_pings
           .filter(lambda p: p.get("defaultSearch", None) is None)
           .map(lambda p: p["clientId"])
           .distinct())
clients = frozenset(clients.collect())
len(clients)
# Group their pings by client and order each ping history by sequence number.

# In[24]:

def get_ping_info(p):
    return {
        "clientId": p["clientId"],
        "seq": p["seq"],
        "defaultSearch": p.get("defaultSearch")
    }

def dedupe_and_sort(group):
    key, history = group
    seen = set()
    result = []
    for fragment in history:
        # Skip duplicate submissions of the same document.
        doc_id = fragment["meta"]["documentId"]
        if doc_id in seen:
            continue
        seen.add(doc_id)
        result.append(get_ping_info(fragment))
    result.sort(key=lambda p: p["seq"])
    return result

grouped = (core_pings
           .groupBy(lambda x: x["clientId"])
           .filter(lambda t: t[0] in clients)
           .map(dedupe_and_sort))
histories = grouped.collect()
len(histories)
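# To verify the deduplication and ordering, peek at the first few entries of one client's history (this assumes at least one affected client; the clientId is redacted and the slice length is arbitrary):

# In[ ]:

[dict(p, clientId='...') for p in histories[0][:5]]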
# Now, let's build a frequency map of *position in the ping sequence* versus *defaultSearch is null*.
# We should normalize this by the actual ping counts at each position to avoid bias.

# In[41]:

non_null_counts = {}
counts = {}

for h in histories:
    for i, p in enumerate(h):
        counts[i] = counts.get(i, 0) + 1
        # Count pings that have a non-null defaultSearch at this position.
        hasDS = p.get("defaultSearch") is not None
        non_null_counts[i] = non_null_counts.get(i, 0) + (1 if hasDS else 0)
# In[45]:

count_series = pd.Series(counts)
non_null_series = pd.Series(non_null_counts)
# The number of pings with a null defaultSearch at each sequence position:
null_by_position = count_series - non_null_series
null_by_position.describe()
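# One way to apply the normalization mentioned above is to express the null submissions as a fraction of all pings at each sequence position:

# In[ ]:

null_fraction = null_by_position / count_series.astype(float)
null_fraction.describe()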
# So all the affected clients have only `null` values for `defaultSearch`.