Last active
August 12, 2016 13:00
-
-
Save georgf/0ffe4f915861be180909037a7204d7b9 to your computer and use it in GitHub Desktop.
mobile repeated profile date
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Bug 1291265 - Check for repeated client counts in new_records in Fennec dashboard data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/hadoop/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n", | |
" warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import plotly.plotly as py\n", | |
"\n", | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"160" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sc.defaultParallelism" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load the mobile clients parquet file for performant analysis." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4119561698" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset = sqlContext.read.load(\"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients\", \"parquet\")\n", | |
"dataset.count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Row(clientid=u'e8c84fb4-f260-47a9-823e-d06e3cdd1764', submissiondate=datetime.datetime(2016, 5, 16, 0, 0), creationdate=None, profiledate=datetime.datetime(2016, 3, 22, 0, 0), geocountry=u'JP', locale=u'ja-JA', os=u'Android', osversion=u'19', buildid=u'20160515030241', appversion=u'49.0a1', device=u'iNet-ADP-921', arch=u'armeabi-v7a', defaultsearch=u'google', distributionid=None, experiments=u'[\"offline-cache\",\"urlbar-show-origin-only\",\"bookmark-history-menu\",\"content-notifications-5pm\",\"urlbar-show-ev-cert-owner\",\"promote-add-to-homescreen\",\"search-term\",\"onboarding2-c\"]', channel=u'nightly', submission=u'20160516')" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset.rdd.first()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Select pings sent on d0 (submission date equals profile date)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"d0 = dataset.filter(\"channel = 'release'\")\\\n", | |
" .filter(\"os = 'Android'\")\\\n", | |
" .filter(\"submissiondate = profiledate\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"139685854" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d0.count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.034" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"round(float(d0.count()) / dataset.count(), 3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Row(clientid=u'ea677620-b51b-40e7-86dc-f91881d6426b', submissiondate=datetime.datetime(2016, 6, 5, 0, 0), creationdate=None, profiledate=datetime.datetime(2016, 6, 5, 0, 0), geocountry=u'GR', locale=u'el-GR', os=u'Android', osversion=u'22', buildid=u'20160502161457', appversion=u'46.0.1', device=u'Sony-E5303', arch=u'armeabi-v7a', defaultsearch=None, distributionid=None, experiments=u'[\"bookmark-history-menu\"]', channel=u'release', submission=u'20160605')" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d0.rdd.first()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Check for repeated d0 per client" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"First, count on how many different days each client submitted d0 pings." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"d0counts = d0.groupBy(['clientid', 'submissiondate'])\\\n", | |
" .count()\\\n", | |
" .groupBy('clientid')\\\n", | |
" .count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[Row(clientid=u'27aa46a1-f1a8-4200-b08f-0b8a0b4f5c3e', count=1),\n", | |
" Row(clientid=u'28d21e91-8bea-4725-be72-3f599a2bcfbd', count=1),\n", | |
" Row(clientid=u'f53e3c2a-521e-46ab-b235-4b32f2c90238', count=1)]" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d0counts.rdd.take(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now, how many of these submitted d0 pings on more than one day?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d0counts.filter(\"count > 1\")\\\n", | |
" .count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Check for repeated profile dates" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"OK, that number is really low, so repeated d0 submissions do not seem to be a problem.\n", | |
"Following up from here: how many clients actually submit more than one profiledate?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Group each client's submissions by profiledate." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"repeatCounts = dataset.filter(\"channel = 'release'\")\\\n", | |
" .filter(\"os = 'Android'\")\\\n", | |
" .groupBy(['clientid', 'profiledate'])\\\n", | |
" .count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[Row(clientid=u'c5e9483e-83d5-4812-aaee-24cfcd29293f', profiledate=datetime.datetime(2016, 5, 18, 0, 0), count=808),\n", | |
" Row(clientid=u'934ba8d0-a183-44f8-acfc-701794e33a26', profiledate=datetime.datetime(2016, 4, 2, 0, 0), count=481),\n", | |
" Row(clientid=u'5ce8ab95-e9dc-40dc-a0b2-7927d15920a5', profiledate=datetime.datetime(2016, 6, 16, 0, 0), count=586)]" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"repeatCounts.rdd.take(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now check how many of them submitted more than one profiledate value." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"6509" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"repeatCounts.groupBy('clientid')\\\n", | |
" .count()\\\n", | |
" .filter('count > 1')\\\n", | |
" .count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ### Bug 1291265 - Check for repeated client counts in new_records in Fennec dashboard data | |
# In[1]: | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
get_ipython().magic(u'pylab inline') | |
# In[2]:
# Show the Spark context's default parallelism.
# NOTE(review): `sc` and `sqlContext` are not defined in this script;
# presumably they are injected by the notebook's Spark environment.
sc.defaultParallelism
# Load the mobile clients parquet data set so the analysis below runs on a
# columnar format.
# In[3]:
MOBILE_CLIENTS_PATH = "s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients"
dataset = sqlContext.read.format("parquet").load(MOBILE_CLIENTS_PATH)
# Total number of rows in the data set.
dataset.count()
# In[4]:
# Peek at one row to see the available columns.
dataset.rdd.first()
# ### Select pings sent on d0
# A "d0" ping here is a row whose submissiondate equals its profiledate.
# Only release-channel Android rows are kept.
# In[6]:
d0 = (
    dataset.filter("channel = 'release'")
           .filter("os = 'Android'")
           .filter("submissiondate = profiledate")
)
# In[10]:
# Count the d0 rows once and reuse the number below; every .count() call
# triggers a full Spark job over the filtered data.
d0_count = d0.count()
d0_count
# In[11]:
# Share of d0 rows relative to the size of the data set.
# NOTE(review): the denominator is the *unfiltered* data set (all channels
# and OSes) while the numerator is release/Android only - confirm that this
# is the intended comparison.
round(float(d0_count) / dataset.count(), 3)
# In[25]:
# Peek at one d0 row as a sanity check of the filter.
d0.rdd.first()
# ### Check for repeated d0 per client
# First, count on how many different days each client submitted d0 pings.
# In[20]:
# One row per (clientid, submissiondate) pair that has at least one d0 ping.
d0_client_days = d0.groupBy(['clientid', 'submissiondate']).count()
# Rows per client in the frame above == number of distinct d0 days per client.
d0counts = d0_client_days.groupBy('clientid').count()
# In[21]:
d0counts.rdd.take(3)
# Now, how many of these clients submitted d0 pings on more than one day?
# In[23]:
d0counts.where("count > 1").count()
# ### Check for repeated profile dates
# The number of clients with d0 pings on more than one day is really low,
# so that does not seem to be a problem.
# Following up from here: how many clients actually submit more than one
# profiledate?
# Group each client's submissions by profiledate.
# In[30]:
release_android = dataset.where("channel = 'release'").where("os = 'Android'")
# One row per (clientid, profiledate) pair, with the number of submissions.
repeatCounts = release_android.groupBy(['clientid', 'profiledate']).count()
# In[31]:
repeatCounts.rdd.take(3)
# Now check how many clients submitted more than one profiledate value.
# In[32]:
# Rows per client in repeatCounts == number of distinct profiledate values.
profiledates_per_client = repeatCounts.groupBy('clientid').count()
profiledates_per_client.filter('count > 1').count()
# In[ ]:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment