Created
October 14, 2017 16:33
-
-
Save saptarshiguha/6badf138a8c2218af361cdd8195b33f9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{python} | |
# --- Environment setup -------------------------------------------------------
# Create (or reuse) the Spark session and make modules under /home/hadoop/
# importable (mozillametricstools is loaded from there below).
from pyspark.sql import SparkSession
import subprocess
import json
import datetime
import sys  # FIX: sys was used below (sys.path.append) but never imported
import pandas as pd

spark = SparkSession.builder.getOrCreate()
sys.path.append("/home/hadoop/")
# Load main_summary with schema merging (columns vary across partitions) and
# expose it to Spark SQL as the view `ms`.
ms = spark.read.option("mergeSchema", "true").\
    parquet("s3://telemetry-parquet/main_summary/v4/")
ms.createOrReplaceTempView('ms')

# Register array_sum(arr) -> int, used by the queries below to total the
# search_counts.count arrays; a null/empty array sums to 0.
# FIX: the original referenced two undefined names — `sqlContext` (never
# created; only `spark` exists) and `pyspark` (only SparkSession was
# imported). Build the SQLContext from the session and import the type.
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType
import mozillametricstools.common.functions as mozfun

sqlContext = SQLContext(spark.sparkContext)
mozfun.register_udf(sqlContext
                    , lambda arr: sum(arr) if arr else 0, "array_sum"
                    , IntegerType())
# Download the official Firefox major-release history and keep the releases
# shipped on or after 2016-07-01, ordered by release date.
from subprocess import call
call(["curl", "-O",
      "https://product-details.mozilla.org/1.0/firefox_history_major_releases.json"])
with open('firefox_history_major_releases.json') as data_file:
    x = json.load(data_file)

# x maps version string -> ISO release date ('YYYY-MM-DD'); lexicographic
# comparison on ISO dates is chronologically correct.
releases = [(version, day) for version, day in x.items() if day >= '2016-07-01']
releases = sorted(releases, key=lambda s: s[1])
# Overall daily active users (dau) and total searches (allsearch) per day on
# the release channel, restricted to the 1% sample (sample_id = '42'), from
# 2016-07-01 onward. Used below as the denominator for per-release fractions.
dau_query = """
select
from_unixtime(unix_timestamp(submission_date_s3, 'yyyyMMdd'), 'yyyy-MM-dd') as date,
count(distinct(client_id)) as dau,
sum(case
when search_counts is not null then array_sum(search_counts.count) else 0
end) as allsearch
from ms
where app_name='Firefox' and normalized_channel = 'release' and sample_id='42'
and submission_date_s3>='20160701'
group by 1 order by 1
"""
dau4ever = spark.sql(dau_query).toPandas()
def daysBetween(f, t):
    """Element-wise day differences between two equal-length sequences of
    'YYYY-MM-DD' date strings.

    Returns a list with (f[i] - t[i]).days for each pair; positive when
    f[i] is the later date. (The original comprehension re-used the names
    f and t for its loop variables, shadowing the parameters — renamed
    here for clarity; behavior is unchanged.)
    """
    fmt = '%Y-%m-%d'
    return [
        (datetime.datetime.strptime(a, fmt) - datetime.datetime.strptime(b, fmt)).days
        for a, b in zip(f, t)
    ]
# For each major release: pull daily counts of profiles created on or after
# the release date (np) and their searches (nsearch), keep the first year
# after release, and express both as fractions of overall DAU / searches.
coll=[]
for i,r in enumerate(releases):
    v,d = r
    # Release date in the two encodings the query needs:
    # yyyymmdd for submission_date_s3, days-since-epoch for
    # profile_creation_date (which main_summary stores as an integer).
    releasedateYYMMDD=datetime.datetime.strptime(d,'%Y-%m-%d').strftime('%Y%m%d')
    dateSinceEpoch=(datetime.datetime.strptime(d,'%Y-%m-%d')-datetime.datetime.strptime("1970-01-01",'%Y-%m-%d')).days
    # Same channel/sample filters as dau4ever so the ratios are comparable.
    q= """select
from_unixtime(unix_timestamp(submission_date_s3, 'yyyyMMdd'), 'yyyy-MM-dd') as date,
count(distinct(client_id)) as np,
sum(case
when search_counts is not null then array_sum(search_counts.count) else 0
end) as nsearch
from ms
where app_name = 'Firefox' and normalized_channel = 'release' and sample_id='42'
and profile_creation_date >= {dateSinceEpoch}
and submission_date_s3 >='{releasedateYYMMDD}'
group by 1 order by 1
""".format(dateSinceEpoch=dateSinceEpoch,releasedateYYMMDD=releasedateYYMMDD)
    f=spark.sql(q).toPandas()
    f["version"] = v
    f["releasedate"] = d
    # Age (in days) of each observation day relative to the release date.
    f['daysSince']=daysBetween(f['date'],f['releasedate'])
    # Keep only the first year after the release.
    f = f.loc[f.daysSince<365]
    # Attach the overall daily totals so the counts can be normalized.
    f=f.merge(dau4ever,how='left',on='date')
    f["pNewOnDAU"]=f['np']/f['dau']
    f["sNewOnDAU"]=f['nsearch']/f['allsearch']
    coll.append(f)
    # Progress indicator (0-based index over len(releases) versions).
    print("""{}/{}""".format(i,len(releases)))
y=pd.concat(coll)
y.to_csv("~/tmp/y.csv")
``` | |
```{r} | |
library(data.table)
library(lattice)
# Daily per-release new-user metrics written by the Python step above.
y=fread("~/tmp/y.csv")
y=y[order(version,daysSince),]
# Mean proportion of DAU / searches coming from new users at each
# day-since-release, with normal-approximation 95% confidence bands
# computed across releases (.N releases contribute to each daysSince).
# NOTE(review): z is computed but not used by the plots below — presumably
# inspected interactively; confirm before removing.
z=y[, list( meanPropDAU=mean(pNewOnDAU)
, lowPropDau=mean(pNewOnDAU) -1.96*sd(pNewOnDAU)/sqrt(.N)
, highPropDau=mean(pNewOnDAU)+1.96*sd(pNewOnDAU)/sqrt(.N)
, meanPropSrch=mean(sNewOnDAU)
, lowPropSrc=mean(sNewOnDAU) -1.96*sd(sNewOnDAU)/sqrt(.N)
, highPropSrc=mean(sNewOnDAU)+1.96*sd(sNewOnDAU)/sqrt(.N)
), by=daysSince]
# One faint grey line per release: share of DAU that is new profiles.
xyplot(pNewOnDAU~ daysSince,type=c('g','l'),col='#00000030',groups=version,data=y,
scale=list(y=list(tick.num=20),x=list(tick.num=50,rot=45,cex=0.6))
,main='DAU proportion from new users')
# Same layout for the share of searches coming from new profiles.
xyplot(sNewOnDAU~ daysSince,type=c('g','l'),col='#00000030',groups=version,data=y,scale=list(y=list(tick.num=20),x=list(tick.num=50,rot=45,cex=0.6))
, main='Search proportion from new users')
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment