Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active May 23, 2016 17:04
Show Gist options
  • Save georgf/59af52c55cea0e88d6a2e39895da296b to your computer and use it in GitHub Desktop.
Save georgf/59af52c55cea0e88d6a2e39895da296b to your computer and use it in GitHub Desktop.
profiledate re-validation
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date
# In[9]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
import re
import email.utils as eut
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``"meta/Date"``.
#
# The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see:
# * https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0
# * https://hg.mozilla.org/mozilla-central/rev/40958aebbb80
# In[46]:
# Kept from the original cell. Not referenced by the queries below, which
# pin a single submission day instead.
submission_dates = ("20160524", "20160414")
# Fetch all Fennec nightly "core" pings submitted on 20160521, once for
# source version 5 and once for source version 6. The two queries differ only
# in ``source_version``; a generator expression is used so no loop variable is
# left behind at module level.
pings5, pings6 = (
    get_pings(
        sc,
        app="Fennec",
        channel="nightly",
        doc_type="core",
        source_version=core_version,
        submission_date="20160521",
        fraction=1.0
    )
    for core_version in ("5", "6")
)
# In[50]:
# Ping fields needed by the validation below.
properties = [
    'meta/Date',
    'meta/submissionDate',
    'clientId',
    'created',
    'tz',
]
# Combine the version 5 and version 6 pings into one data set, then reduce
# every ping to just the selected properties.
merged = pings5 + pings6
pings = get_pings_properties(merged, properties)
# In[51]:
# Look at a single record (only displayed in an interactive session).
pings.first()
# ### Validate
# In[52]:
def valid_date_header(p):
    """Return True if the ping's ``meta/Date`` header has the expected format.

    The accepted shape is
    ``<non-digits>, <1-2 digits> <non-digits> <4 digits> HH:MM:SS GMT+00:00``,
    for example ``Tue, 10 May 2016 12:34:56 GMT+00:00``.

    Args:
        p: Mapping of ping properties; the header is read from
            ``p['meta/Date']``.

    Returns:
        bool: True when the header is a string matching the pattern; False
        when it does not match or is not a string at all (e.g. ``None``).

    Raises:
        KeyError: If ``p`` has no ``'meta/Date'`` entry.
    """
    # Raw string so the regex escapes (\D, \d, \+) reach ``re`` untouched
    # instead of relying on Python passing unknown string escapes through.
    date_pattern = r'^\D+, \d{1,2} \D+ \d{4,4} \d\d:\d\d:\d\d GMT\+00:00$'
    header = p['meta/Date']
    try:
        matched = re.match(date_pattern, header)
    except TypeError:
        # re.match rejects non-string input; such a header is simply invalid.
        return False
    return matched is not None
# Try the validator on a single ping from the data set. The return value is
# not stored, so it is only visible when run interactively.
valid_date_header(pings.first())
# In[54]:
def ping_check(ping):
    """Validate one ping and return a short label describing the result.

    Checks, in order: that every expected field is present, that each field
    has one of the accepted types, and that the ``meta/Date`` header passes
    ``valid_date_header``.

    Args:
        ping: Mapping of ping properties (supports ``in`` and ``[]`` lookup).

    Returns:
        str: ``''`` when every check passes; otherwise the first problem
        found, one of ``'missing field: <name>'``, ``'wrong type for <name>'``
        or ``'invalid date header'``.
    """
    # Accepted exact types per field (Python 2 type names).
    props = {
        'meta/Date': [unicode],
        'meta/submissionDate': [unicode],
        'clientId': [unicode],
        'created': [unicode],
        'tz': [int, long]
    }
    for k, types in props.iteritems():
        if k not in ping:
            # Report the field being checked; this previously used an
            # undefined name and raised NameError instead of returning.
            return 'missing field: ' + k
        # Exact type comparison on purpose: isinstance() would also accept
        # subclasses, e.g. bool for the integer 'tz' field.
        if type(ping[k]) not in types:
            return 'wrong type for ' + k
    # Only reached once all fields have the right type, so the header is a
    # string by the time it is pattern-matched.
    if not valid_date_header(ping):
        return 'invalid date header'
    return ''
# In[55]:
# Pair every ping with its validation outcome: (ping_check label, ping).
results = pings.keyBy(ping_check)
# In[57]:
# Count how many pings produced each validation label.
results.countByKey()
# In[62]:
# Gather the pings that share a label, then keep only the groups themselves
# (the label is dropped).
grouped = results.groupByKey().values()
grouped.cache()
# In[63]:
# Bring the grouped pings back to the driver for inspection.
grouped.collect()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment