Skip to content

Instantly share code, notes, and snippets.

View saptarshiguha's full-sized avatar

Saptarshi Guha saptarshiguha

View GitHub Profile
import datetime
import json
import random
import subprocess
import time
import pandas as pd
# Seed the RNG with a fixed value so the same ids are drawn on every run.
random.seed(10)
# Two integers, each drawn uniformly from 1..100 inclusive.
sampleids = [random.randint(1, 100) for _ in range(2)]
# The same ids rendered as single-quoted strings, e.g. 7 -> "'7'".
samplechar = list(map("'{}'".format, sampleids))
# Load the main_summary v3 parquet data set from S3, passing mergeSchema=True
# to the reader.
# NOTE(review): sqlContext is not defined in the lines visible before this
# statement - presumably it is provided by the Spark shell/notebook; confirm.
u0 = sqlContext.read.load("s3://telemetry-parquet/main_summary/v3", "parquet",mergeSchema=True)
import pyspark
import py4j
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Create a SparkContext (no explicit arguments) and wrap it in a SQLContext,
# which the rest of the script uses to read data and run SQL.
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)
# Print the SQLContext object so its creation is visible in the output.
print(sqlContext)
import readline
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
import sys
import datetime
import json
import random
import subprocess
# Load the main_summary v3 parquet data set from S3 and register it as the
# temporary table "ms" so it can be referenced from SQL queries.
mp = sqlContext.read.load("s3://telemetry-parquet/main_summary/v3",'parquet')
mp.registerTempTable("ms")
# Starts out empty.
# NOTE(review): presumably filled by the per-month loop that follows - the
# loop body is not fully visible here; confirm.
newusers=[]
for whatdate in [('2016-04',16892,16922),('2016-05',16922,16953),('2016-06',16953,16983),('2016-07',16983,17014),('2016-08',17014,17045),('2016-09',17045,17075),('2016-10',17075,17106),('2016-11',17106,17136)]:
print(whatdate)
res2 = sqlContext.sql("""
select client_id, count(distinct( substring(subsession_start_date,1,10))) as ndayactive
from ms where app_name = 'Firefox'
and substring(subsession_start_date,1,7)=='{}'
# Load the main_summary v3 parquet data set from S3 and register it as the
# temporary table "ms" so it can be referenced from SQL queries.
mp = sqlContext.read.load("s3://telemetry-parquet/main_summary/v3",'parquet')
mp.registerTempTable("ms")
# Starts out empty.
# NOTE(review): presumably filled by the per-month loop that follows - the
# loop body is not fully visible here; confirm.
newusers=[]
for whatdate in [('2016-04',16892,16922),('2016-05',16922,16953),('2016-06',16953,16983),('2016-07',16983,17014),('2016-08',17014,17045),('2016-09',17045,17075),('2016-10',17075,17106),('2016-11',17106,17136)]:
print(whatdate)
res2 = sqlContext.sql("""
select client_id, count(distinct( substring(subsession_start_date,1,10))) as ndayactive
from ms where app_name = 'Firefox'
and substring(subsession_start_date,1,7)=='{}'
# Read the main_summary v3 parquet data set from S3, passing mergeSchema=True
# to the reader.
u0 = sqlContext.read.load(
    "s3://telemetry-parquet/main_summary/v3",
    "parquet",
    mergeSchema=True
)

# Project the columns used downstream, renaming most of them to short aliases.
u1 = u0.select(
    u0["client_id"],
    u0["sample_id"],  # remove when ready to do all counts
    u0["app_name"].alias("appname"),
    u0["crash_submit_success_main"].alias("crashmain"),
    u0["crashes_detected_content"].alias("crashcontent"),
    # Keep 10 characters starting at position 1 of the start-date string.
    u0["subsession_start_date"].substr(1, 10).alias("date"),
    u0["subsession_length"].alias("length"),
    u0["total_uri_count"].alias("uri"),
    u0["unique_domains_count"].alias("domain")
)
# Load the main_summary v3 parquet data set from S3 (no extra reader options).
# NOTE(review): sqlContext is not defined in this fragment - presumably it is
# provided by the Spark shell/notebook; confirm.
mainpingspq = sqlContext.read.load("s3://telemetry-parquet/main_summary/v3", "parquet")
import datetime

# Reference point for converting calendar dates into day counts.
DATE_1970_01_01 = datetime.datetime(1970, 1, 1)


def _days_since_epoch(day):
    """Return the whole days between 1970-01-01 and a 'YYYY-MM-DD' string."""
    parsed = datetime.datetime.strptime(day, "%Y-%m-%d")
    return (parsed - DATE_1970_01_01).days


# "profile" window bounds as ISO date strings.
START_profile = "2016-08-14"
END_profile = "2016-08-28"

# The same bounds expressed in days since Jan 1, 1970.
DAYS_START_profile = _days_since_epoch(START_profile)
DAYS_END_profile = _days_since_epoch(END_profile)

# "ping" window bounds as ISO date strings.
START_ping = '2016-08-14'
END_ping = '2016-10-15'

# Compact YYYYMMDD date string.
# NOTE(review): presumably compared against an S3 partition/submission date -
# its use is not visible here; confirm.
START_s3 = '20160813'
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.