Created
July 18, 2016 22:26
-
-
Save saptarshiguha/2ec214aa1b5da499cc797599197373b8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| addonID = ["{e4a8a97b-f2ed-450b-b12d-ee082ba24781}", | |
| "{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}", | |
| "{b9db16a4-6edc-47ec-a1f4-b86292ed211d}", | |
| "jid1-F9UJ2thwoAm5gQ@jetpack", | |
| "{3d7eb24f-2740-49df-8937-200b1cc08f8a}", | |
| "{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}", | |
| "[email protected]", | |
| "jid1-Xo5SuA6qc1DFpw@jetpack", | |
| "jid0-GXjLLfbCoAx0LcltEdFrEkQdQPI@jetpack"] | |
| import mozillametricstools.common.functions as cf | |
| import pyspark.sql.functions as fun | |
| from pyspark.sql.window import Window | |
| from pyspark.sql import Row | |
| from pyspark.sql.types import BooleanType ,StringType,ArrayType | |
| frame = sqlContext.read.load(cf.latest_longitudinal_path(),'parquet') | |
| #################################################################################################### | |
| ## Begin Analyses | |
| #################################################################################################### | |
| ## The Sample I need | |
| ## 1.Select Beta profiles and for those select sessions where there version is >=49 | |
| ## (see https://bugzilla.mozilla.org/show_bug.cgi?id=1281281) | |
| xframe = frame | |
| s0 = xframe.filter("normalized_channel = 'beta'") | |
| s1 = s0.select([ | |
| fun.col("client_id"), | |
| fun.col("subprocess_crashes_with_dump").content.alias("content_crashes"), | |
| fun.col("settings").e10s_enabled[0].alias("hasE10s"), | |
| fun.explode(fun.col("active_addons")[0]).alias("aid","aic"), | |
| fun.col("build").platform_version.alias("platform_version"), | |
| fun.col("subsession_length") | |
| ]).select(["*", | |
| fun.when(fun.col("aid").isin(addonID),1.0).otherwise(0.0).alias("correctAddon") | |
| ]) | |
| window_spec = Window.partitionBy(s1["client_id"]) | |
| isProfileWithRightAddon = fun.max(s1["correctAddon"]).over(window_spec) | |
| s2 = s1.select( | |
| "client_id", | |
| "hasE10s", | |
| isProfileWithRightAddon.alias("profileHasReqdAddons"), | |
| "platform_version", | |
| "content_crashes", | |
| "subsession_length", | |
| "aid") | |
| s2.take(2) | |
| ## Now For a quick count before we restrict ourselves to | |
| ## profiles on version 49 | |
| oo = s2.groupBy("hasE10s","profileHasReqdAddons").count().collect() | |
| print(oo) | |
| ## Now Filter for profiles on Version 49 | |
| s3 = s2.select("*", fun.explode("platform_version").alias("versionexp")) | |
| s3 = s3.select("*", fun.col("versionexp").substr(1,2).alias("version2")) | |
| s3 = s3.filter('version2 >= "43"') | |
| ## Counts by version too | |
| deepCount = s3.groupBy("hasE10s","profileHasReqdAddons","version2").count().collect() | |
| print(deepCount) | |
| ## Now simplify the dataframe by selecting only what we need | |
| s3 = s3.select("*",fun.explode("content_crashes").alias("cc")) | |
| s3 = s3.select("*",fun.explode("subsession_length").alias("slength")) | |
| s3 = s3.select("client_id","hasE10s","profileHasReqdAddons","cc","slength") | |
| ## Not aggregating by Client_ID | |
| s4 = s3.groupBy("hasE10s","profileHasReqdAddons").agg( | |
| fun.countDistinct(fun.col("client_id")).alias("n"), | |
| fun.sum(fun.col("cc")).alias("tCC"), | |
| fun.sum(fun.col("slength")).alias("tSl") ).collect() | |
| ## But Maybe We should Aggregate By ClientID - one row per | |
| s4 = s3.filter("slength >0").groupBy("client_id","hasE10s","profileHasReqdAddons").agg( | |
| fun.count("cc").alias("k"), | |
| fun.sum("cc").alias("tCC"), | |
| fun.sum("slength").alias("tSL")) | |
| s5 = s4.select("client_id","hasE10s","profileHasReqdAddons","k", (fun.col("tCC")/fun.col("tSL")*3600.0).alias("crashRate") , (fun.col("tSL")/fun.col("k")).alias("avSL"), (fun.col("tCC")/fun.col("k")).alias("avCC") ) | |
| s5.take(20) | |
| s4 = s5.groupBy("hasE10s","profileHasReqdAddons").agg( | |
| fun.countDistinct(fun.col("client_id")).alias("n"), | |
| fun.sum(fun.col("crashRate")).alias("tCC"), ## sum of | |
| fun.sum(fun.pow("crashRate"),2).alias("tCC2"), ## sum of Squares | |
| fun.sum(fun.col("avSL")).alias("tavSL"), # sum of | |
| fun.sum(fun.pow(fun.col("avSL"),2)).alias("tavSL2"), ## sum of Squares | |
| fun.sum(fun.col("avCC")).alias("tavCC"), # sum of | |
| fun.sum(fun.pow(fun.col("avCC"),2)).alias("tavCC2") ## sum of Squares | |
| ).collect() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment