Last active
          May 23, 2016 17:04 
        
      - 
      
 - 
        
Save georgf/59af52c55cea0e88d6a2e39895da296b to your computer and use it in GitHub Desktop.  
    profiledate re-validation
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # coding: utf-8 | |
| # ### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date | |
| # In[9]: | |
| import ujson as json | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.plotly as py | |
| import datetime as dt | |
| from uuid import UUID | |
| import re | |
| import email.utils as eut | |
| from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history | |
| get_ipython().magic(u'pylab inline') | |
| # The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``"meta/Date"``. | |
| # | |
| # The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see: | |
| # * https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0 | |
| # * https://hg.mozilla.org/mozilla-central/rev/40958aebbb80 | |
| # In[46]: | |
# NOTE(review): this tuple is not referenced by the two queries below, which
# both pin a single submission date.
submission_dates = ("20160524", "20160414")

# Filter shared by both queries; only the core ping version differs.
# `sc` is not defined in this file -- presumably the notebook's Spark
# context; confirm against the execution environment.
core_ping_filter = dict(
    app="Fennec",
    channel="nightly",
    doc_type="core",
    submission_date="20160521",
    fraction=1.0,
)

# Core pings reporting source version 5.
pings5 = get_pings(sc, source_version="5", **core_ping_filter)
# Core pings reporting source version 6.
pings6 = get_pings(sc, source_version="6", **core_ping_filter)
| # In[50]: | |
# Property paths to pull out of every ping.
properties = ['meta/Date', 'meta/submissionDate', 'clientId', 'created', 'tz']

# Combine the version 5 and version 6 pings into one collection, then reduce
# each ping to just the properties listed above.
merged = pings5 + pings6
pings = get_pings_properties(merged, properties)
# In[51]:
# Show one extracted ping as a sanity check.
pings.first()
| # ### Validate | |
| # In[52]: | |
def valid_date_header(p):
    """Return True when the ping's ``meta/Date`` value has the expected format.

    An accepted value looks like ``Sat, 21 May 2016 10:15:30 GMT+00:00``:
    a non-digit run, a comma, a 1-2 digit number, another non-digit run, a
    4 digit number, ``HH:MM:SS`` and the literal suffix ``GMT+00:00``.

    Args:
        p: Mapping holding the ping properties; only ``'meta/Date'`` is read.

    Returns:
        bool: True if the value matches the pattern. False if it does not
        match, is missing, or is not a string.
    """
    # Raw string so the regex escapes (\D, \d, \+) reach `re` untouched
    # instead of relying on Python passing unknown string escapes through.
    date_pattern = r'^\D+, \d{1,2} \D+ \d{4,4} \d\d:\d\d:\d\d GMT\+00:00$'
    date_header = p.get('meta/Date')
    try:
        return re.match(date_pattern, date_header) is not None
    except TypeError:
        # A missing or non-string header (e.g. None) cannot be valid.
        return False
# Spot-check the header validator on a single ping.
first_ping = pings.first()
valid_date_header(first_ping)
| # In[54]: | |
def ping_check(ping):
    """Validate one ping and describe the first problem found.

    Checks that every expected property is present with the expected type,
    then checks the format of the ``meta/Date`` header with
    ``valid_date_header``.

    Args:
        ping: Mapping of property path to value.

    Returns:
        str: An empty string when the ping passes every check, otherwise a
        short message naming the check that failed.
    """
    # The notebook targets Python 2 (`unicode` / `long`); fall back to the
    # Python 3 equivalents so the check also runs where those names are gone.
    try:
        text_types, integer_types = (unicode,), (int, long)
    except NameError:
        text_types, integer_types = (str,), (int,)

    # A tuple of pairs (rather than a dict) keeps the order in which
    # problems are reported deterministic.
    expected = (
        ('meta/Date', text_types),
        ('meta/submissionDate', text_types),
        ('clientId', text_types),
        ('created', text_types),
        ('tz', integer_types),
    )

    for key, types in expected:
        if key not in ping:
            # Previously concatenated an undefined name here, which raised
            # NameError instead of reporting the missing field.
            return 'missing field: ' + key
        # Exact type match on purpose: isinstance() would accept a bool
        # for 'tz' because bool is a subclass of int.
        if type(ping[key]) not in types:
            return 'wrong type for ' + key

    # Only reached once 'meta/Date' is known to be present and textual.
    if not valid_date_header(ping):
        return 'invalid date header'

    return ''
| # In[55]: | |
def _label_with_check(ping):
    """Pair a ping with the message returned by ping_check for it."""
    return (ping_check(ping), ping)


def _group_members(keyed_group):
    """Drop the key of a (key, values) pair and keep only the values."""
    return keyed_group[1]


# Key every ping by its validation message ('' when nothing failed).
results = pings.map(_label_with_check)
# In[57]:
# Count of pings per validation message.
results.countByKey()
# In[62]:
# Collect the pings sharing a message into one group, discarding the message.
grouped = results.groupByKey().map(_group_members)
grouped.cache()
# In[63]:
grouped.collect()
| # In[ ]: | |
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment