Skip to content

Instantly share code, notes, and snippets.

@saptarshiguha
Created July 18, 2016 22:26
Show Gist options
  • Select an option

  • Save saptarshiguha/2ec214aa1b5da499cc797599197373b8 to your computer and use it in GitHub Desktop.

Select an option

Save saptarshiguha/2ec214aa1b5da499cc797599197373b8 to your computer and use it in GitHub Desktop.
addonID = ["{e4a8a97b-f2ed-450b-b12d-ee082ba24781}",
"{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}",
"{b9db16a4-6edc-47ec-a1f4-b86292ed211d}",
"jid1-F9UJ2thwoAm5gQ@jetpack",
"{3d7eb24f-2740-49df-8937-200b1cc08f8a}",
"{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}",
"[email protected]",
"jid1-Xo5SuA6qc1DFpw@jetpack",
"jid0-GXjLLfbCoAx0LcltEdFrEkQdQPI@jetpack"]
import mozillametricstools.common.functions as cf
import pyspark.sql.functions as fun
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.types import BooleanType ,StringType,ArrayType
frame = sqlContext.read.load(cf.latest_longitudinal_path(),'parquet')
####################################################################################################
## Begin Analyses
####################################################################################################
## The Sample I need
## 1.Select Beta profiles and for those select sessions where there version is >=49
## (see https://bugzilla.mozilla.org/show_bug.cgi?id=1281281)
xframe = frame
s0 = xframe.filter("normalized_channel = 'beta'")
s1 = s0.select([
fun.col("client_id"),
fun.col("subprocess_crashes_with_dump").content.alias("content_crashes"),
fun.col("settings").e10s_enabled[0].alias("hasE10s"),
fun.explode(fun.col("active_addons")[0]).alias("aid","aic"),
fun.col("build").platform_version.alias("platform_version"),
fun.col("subsession_length")
]).select(["*",
fun.when(fun.col("aid").isin(addonID),1.0).otherwise(0.0).alias("correctAddon")
])
window_spec = Window.partitionBy(s1["client_id"])
isProfileWithRightAddon = fun.max(s1["correctAddon"]).over(window_spec)
s2 = s1.select(
"client_id",
"hasE10s",
isProfileWithRightAddon.alias("profileHasReqdAddons"),
"platform_version",
"content_crashes",
"subsession_length",
"aid")
s2.take(2)
## Now For a quick count before we restrict ourselves to
## profiles on version 49
oo = s2.groupBy("hasE10s","profileHasReqdAddons").count().collect()
print(oo)
## Now Filter for profiles on Version 49
s3 = s2.select("*", fun.explode("platform_version").alias("versionexp"))
s3 = s3.select("*", fun.col("versionexp").substr(1,2).alias("version2"))
s3 = s3.filter('version2 >= "43"')
## Counts by version too
deepCount = s3.groupBy("hasE10s","profileHasReqdAddons","version2").count().collect()
print(deepCount)
## Now simplify the dataframe by selecting only what we need
s3 = s3.select("*",fun.explode("content_crashes").alias("cc"))
s3 = s3.select("*",fun.explode("subsession_length").alias("slength"))
s3 = s3.select("client_id","hasE10s","profileHasReqdAddons","cc","slength")
## Not aggregating by Client_ID
s4 = s3.groupBy("hasE10s","profileHasReqdAddons").agg(
fun.countDistinct(fun.col("client_id")).alias("n"),
fun.sum(fun.col("cc")).alias("tCC"),
fun.sum(fun.col("slength")).alias("tSl") ).collect()
## But Maybe We should Aggregate By ClientID - one row per
s4 = s3.filter("slength >0").groupBy("client_id","hasE10s","profileHasReqdAddons").agg(
fun.count("cc").alias("k"),
fun.sum("cc").alias("tCC"),
fun.sum("slength").alias("tSL"))
s5 = s4.select("client_id","hasE10s","profileHasReqdAddons","k", (fun.col("tCC")/fun.col("tSL")*3600.0).alias("crashRate") , (fun.col("tSL")/fun.col("k")).alias("avSL"), (fun.col("tCC")/fun.col("k")).alias("avCC") )
s5.take(20)
s4 = s5.groupBy("hasE10s","profileHasReqdAddons").agg(
fun.countDistinct(fun.col("client_id")).alias("n"),
fun.sum(fun.col("crashRate")).alias("tCC"), ## sum of
fun.sum(fun.pow("crashRate"),2).alias("tCC2"), ## sum of Squares
fun.sum(fun.col("avSL")).alias("tavSL"), # sum of
fun.sum(fun.pow(fun.col("avSL"),2)).alias("tavSL2"), ## sum of Squares
fun.sum(fun.col("avCC")).alias("tavCC"), # sum of
fun.sum(fun.pow(fun.col("avCC"),2)).alias("tavCC2") ## sum of Squares
).collect()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment