Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active May 23, 2016 17:04
Show Gist options
  • Save georgf/59af52c55cea0e88d6a2e39895da296b to your computer and use it in GitHub Desktop.
Save georgf/59af52c55cea0e88d6a2e39895da296b to your computer and use it in GitHub Desktop.
profiledate re-validation
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import ujson as json\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"import datetime as dt\n",
"from uuid import UUID\n",
"import re\n",
"import email.utils as eut\n",
"\n",
"from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``\"meta/Date\"``.\n",
"\n",
"The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see:\n",
"* https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0\n",
"* https://hg.mozilla.org/mozilla-central/rev/40958aebbb80"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"submission_dates = (\"20160524\", \"20160414\")\n",
"pings5 = get_pings(sc,\n",
" app=\"Fennec\",\n",
" channel=\"nightly\",\n",
" doc_type=\"core\",\n",
" source_version=\"5\",\n",
" submission_date=\"20160521\",\n",
" fraction=1.0)\n",
"pings6 = get_pings(sc,\n",
" app=\"Fennec\",\n",
" channel=\"nightly\",\n",
" doc_type=\"core\",\n",
" source_version=\"6\",\n",
" submission_date=\"20160521\",\n",
" fraction=1.0)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"merged = pings5 + pings6\n",
"properties = ['meta/Date', 'meta/submissionDate', 'clientId', 'created', 'tz']\n",
"pings = get_pings_properties(merged, properties)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'clientId': u'a7cdd71f-bee8-4268-bc25-caae17507d49',\n",
" 'created': u'2016-05-20',\n",
" 'meta/Date': u'Sat, 21 May 2016 00:03:59 GMT+00:00',\n",
" 'meta/submissionDate': u'20160521',\n",
" 'tz': -300}"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pings.first()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Validate"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def valid_date_header(p):\n",
"    \"\"\"Return True when p['meta/Date'] matches the expected Date header format.\n",
"\n",
"    The accepted form looks like 'Sat, 21 May 2016 00:03:59 GMT+00:00'.\n",
"    A non-string header value (e.g. None) is reported as invalid instead of raising.\n",
"    \"\"\"\n",
"    # Raw string so the regex escapes are not interpreted as string escapes.\n",
"    date_pattern = r'^\\D+, \\d{1,2} \\D+ \\d{4,4} \\d\\d:\\d\\d:\\d\\d GMT\\+00:00$'\n",
"    header = p['meta/Date']\n",
"    try:\n",
"        return re.match(date_pattern, header) is not None\n",
"    except TypeError:\n",
"        # re.match raises TypeError for non-string input.\n",
"        return False\n",
"\n",
"valid_date_header(pings.first())"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def ping_check(ping):\n",
"    \"\"\"Validate one ping; return '' if it passes, else a short failure reason.\"\"\"\n",
"    props = {\n",
"        'meta/Date': [unicode],\n",
"        'meta/submissionDate': [unicode],\n",
"        'clientId': [unicode],\n",
"        'created': [unicode],\n",
"        'tz': [int, long]\n",
"    }\n",
"\n",
"    for k, types in props.iteritems():\n",
"        if k not in ping:\n",
"            # Name the absent field in the message.\n",
"            return 'missing field: ' + k\n",
"        # Exact type match: a bool value is not accepted for an int field.\n",
"        if type(ping[k]) not in types:\n",
"            return 'wrong type for ' + k\n",
"\n",
"    if not valid_date_header(ping):\n",
"        return 'invalid date header'\n",
"\n",
"    return ''"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"results = pings.map(lambda p: (ping_check(p), p))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int, {'': 7951, 'invalid date header': 194})"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.countByKey()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PythonRDD[79] at RDD at PythonRDD.scala:43"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grouped = results.groupByKey().map(lambda t: t[1])\n",
"grouped.cache()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<pyspark.resultiterable.ResultIterable at 0x7f22e40cf110>,\n",
" <pyspark.resultiterable.ResultIterable at 0x7f22e40bcfd0>]"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grouped.collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# ### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date
# In[9]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
import re
import email.utils as eut
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``"meta/Date"``.
#
# The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see:
# * https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0
# * https://hg.mozilla.org/mozilla-central/rev/40958aebbb80
# In[46]:
# NOTE(review): submission_dates is assigned but not referenced by the queries
# below, which use a fixed single-day submission_date instead.
submission_dates = ("20160524", "20160414")

# Filters shared by both queries; only the core ping version differs.
core_ping_filters = dict(
    app="Fennec",
    channel="nightly",
    doc_type="core",
    submission_date="20160521",
    fraction=1.0,
)
pings5 = get_pings(sc, source_version="5", **core_ping_filters)
pings6 = get_pings(sc, source_version="6", **core_ping_filters)
# In[50]:
# Combine the version 5 and version 6 pings.
merged = pings5 + pings6
# Reduce each ping to the fields under validation.
properties = [
    'meta/Date',
    'meta/submissionDate',
    'clientId',
    'created',
    'tz',
]
pings = get_pings_properties(merged, properties)
# In[51]:
# Peek at one extracted ping.
pings.first()
# ### Validate
# In[52]:
def valid_date_header(p):
    """Return True when ``p['meta/Date']`` matches the expected Date header format.

    The accepted form looks like ``'Sat, 21 May 2016 00:03:59 GMT+00:00'``:
    a non-digit day name, a 1-2 digit day, a non-digit month name, a 4 digit
    year, an ``hh:mm:ss`` time and the literal suffix ``GMT+00:00``.

    Args:
        p: mapping that contains a ``'meta/Date'`` entry.

    Returns:
        True if the header matches the pattern, False otherwise. A non-string
        header value (e.g. None) is reported as invalid instead of raising.

    Raises:
        KeyError: if ``p`` has no ``'meta/Date'`` entry.
    """
    # Raw string so the regex escapes are not interpreted as string escapes.
    date_pattern = r'^\D+, \d{1,2} \D+ \d{4,4} \d\d:\d\d:\d\d GMT\+00:00$'
    header = p['meta/Date']
    try:
        return re.match(date_pattern, header) is not None
    except TypeError:
        # re.match raises TypeError for non-string input.
        return False
# Smoke-test the validator on the first ping.
valid_date_header(pings.first())
# In[54]:
def ping_check(ping):
    """Validate one ping and return a short failure reason.

    Checks that every expected field is present and has one of its accepted
    types, then that ``meta/Date`` passes ``valid_date_header``.

    Args:
        ping: mapping of property name to value.

    Returns:
        '' when all checks pass; otherwise a message describing a failed
        check: 'missing field: NAME', 'wrong type for NAME' or
        'invalid date header'.
    """
    # Accepted types per field (Python 2 type names).
    props = {
        'meta/Date': [unicode],
        'meta/submissionDate': [unicode],
        'clientId': [unicode],
        'created': [unicode],
        'tz': [int, long]
    }

    for k, types in props.iteritems():
        if k not in ping:
            # Name the absent field in the message.
            return 'missing field: ' + k
        # Exact type match: a bool value is not accepted for an int field.
        if type(ping[k]) not in types:
            return 'wrong type for ' + k

    if not valid_date_header(ping):
        return 'invalid date header'

    return ''
# In[55]:
# Tag every ping with its validation verdict ('' means it passed).
results = pings.map(lambda ping: (ping_check(ping), ping))
# In[57]:
# Tally pings per verdict.
results.countByKey()
# In[62]:
# Bucket pings by verdict, keep only the buckets, and cache them.
grouped = results.groupByKey().values()
grouped.cache()
# In[63]:
grouped.collect()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment