Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save saptarshiguha/6badf138a8c2218af361cdd8195b33f9 to your computer and use it in GitHub Desktop.
```{python}
# Set up the Spark session, expose main_summary as a temp view, register the
# array_sum UDF, and download the Firefox major-release history file.
import datetime
import json
import subprocess
import sys  # BUG FIX: sys was used below but never imported

import pandas as pd
import pyspark.sql.types  # BUG FIX: pyspark.sql.types was referenced but not imported
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.getOrCreate()
# Make locally deployed helper modules (mozillametricstools) importable.
sys.path.append("/home/hadoop/")

# mergeSchema so columns added to main_summary over time are all visible.
ms = spark.read.option("mergeSchema", "true").\
    parquet("s3://telemetry-parquet/main_summary/v4/")
ms.createOrReplaceTempView('ms')

import mozillametricstools.common.functions as mozfun

# BUG FIX: sqlContext was never defined; register_udf expects a SQLContext,
# so build one from the active session's SparkContext.
sqlContext = SQLContext(spark.sparkContext)
mozfun.register_udf(sqlContext
                    , lambda arr: sum(arr) if arr else 0, "array_sum"
                    , pyspark.sql.types.IntegerType())

# Fetch the official version -> release-date (YYYY-MM-DD) mapping.
subprocess.call(["curl", "-O",
                 "https://product-details.mozilla.org/1.0/firefox_history_major_releases.json"])
# Load the version -> release-date mapping and keep only releases from
# 2016-07-01 onward, ordered chronologically. ISO 'YYYY-MM-DD' strings
# compare correctly as plain strings, so no date parsing is needed here.
with open('firefox_history_major_releases.json') as data_file:
    version_dates = json.load(data_file)

releases = sorted(
    [(version, day) for version, day in version_dates.items()
     if day >= '2016-07-01'],
    key=lambda pair: pair[1])
# Daily release-channel DAU and total search volume over a 1% sample
# (sample_id='42'); used later as the denominator for new-profile shares.
dau_query = """
select
from_unixtime(unix_timestamp(submission_date_s3, 'yyyyMMdd'), 'yyyy-MM-dd') as date,
count(distinct(client_id)) as dau,
sum(case
when search_counts is not null then array_sum(search_counts.count) else 0
end) as allsearch
from ms
where app_name='Firefox' and normalized_channel = 'release' and sample_id='42'
and submission_date_s3>='20160701'
group by 1 order by 1
"""
dau4ever = spark.sql(dau_query).toPandas()
def daysBetween(f, t):
    """Return elementwise day differences between two date-string sequences.

    Args:
        f: sequence of 'YYYY-MM-DD' strings (the later dates).
        t: sequence of 'YYYY-MM-DD' strings (the earlier dates), same length.

    Returns:
        list of ints: (f[i] - t[i]) in whole days for each pair (negative
        when f[i] precedes t[i]).
    """
    fmt = '%Y-%m-%d'
    # Use distinct loop variables: the original comprehension shadowed the
    # parameters f and t, which obscured what was being iterated.
    return [(datetime.datetime.strptime(a, fmt)
             - datetime.datetime.strptime(b, fmt)).days
            for a, b in zip(f, t)]
# For each release, pull daily counts of profiles created on/after the
# release date, keep the release's first year, and express those counts as
# a share of overall DAU / searches. Results accumulate into one frame.
coll = []
epoch = datetime.datetime.strptime("1970-01-01", '%Y-%m-%d')
for idx, (version, release_day) in enumerate(releases):
    release_dt = datetime.datetime.strptime(release_day, '%Y-%m-%d')
    # profile_creation_date is stored as days since the Unix epoch;
    # submission_date_s3 is a YYYYMMDD string.
    releasedateYYMMDD = release_dt.strftime('%Y%m%d')
    dateSinceEpoch = (release_dt - epoch).days
    q = """select
from_unixtime(unix_timestamp(submission_date_s3, 'yyyyMMdd'), 'yyyy-MM-dd') as date,
count(distinct(client_id)) as np,
sum(case
when search_counts is not null then array_sum(search_counts.count) else 0
end) as nsearch
from ms
where app_name = 'Firefox' and normalized_channel = 'release' and sample_id='42'
and profile_creation_date >= {dateSinceEpoch}
and submission_date_s3 >='{releasedateYYMMDD}'
group by 1 order by 1
""".format(dateSinceEpoch=dateSinceEpoch, releasedateYYMMDD=releasedateYYMMDD)
    cohort = spark.sql(q).toPandas()
    cohort["version"] = version
    cohort["releasedate"] = release_day
    cohort['daysSince'] = daysBetween(cohort['date'], cohort['releasedate'])
    # Restrict to the release's first year on the market.
    cohort = cohort.loc[cohort.daysSince < 365]
    # Join the all-up daily totals to form proportions.
    cohort = cohort.merge(dau4ever, how='left', on='date')
    cohort["pNewOnDAU"] = cohort['np'] / cohort['dau']
    cohort["sNewOnDAU"] = cohort['nsearch'] / cohort['allsearch']
    coll.append(cohort)
    print("{}/{}".format(idx, len(releases)))

y = pd.concat(coll)
y.to_csv("~/tmp/y.csv")
```
```{r}
## Load the per-release daily cohort data written by the Python step and
## summarise it by days-since-release with normal-approximation 95% CIs.
library(data.table)
library(lattice)

y <- fread("~/tmp/y.csv")
y <- y[order(version, daysSince), ]

## NOTE(review): z is computed here but not referenced by the plots below.
ci <- 1.96
z <- y[, list(meanPropDAU  = mean(pNewOnDAU),
              lowPropDau   = mean(pNewOnDAU) - ci * sd(pNewOnDAU) / sqrt(.N),
              highPropDau  = mean(pNewOnDAU) + ci * sd(pNewOnDAU) / sqrt(.N),
              meanPropSrch = mean(sNewOnDAU),
              lowPropSrc   = mean(sNewOnDAU) - ci * sd(sNewOnDAU) / sqrt(.N),
              highPropSrc  = mean(sNewOnDAU) + ci * sd(sNewOnDAU) / sqrt(.N)),
       by = daysSince]
## Spaghetti plots: one translucent line per release version, grid underlay.
axis_scales <- list(y = list(tick.num = 20),
                    x = list(tick.num = 50, rot = 45, cex = 0.6))
xyplot(pNewOnDAU ~ daysSince, data = y, groups = version,
       type = c('g', 'l'), col = '#00000030', scale = axis_scales,
       main = 'DAU proportion from new users')
xyplot(sNewOnDAU ~ daysSince, data = y, groups = version,
       type = c('g', 'l'), col = '#00000030', scale = axis_scales,
       main = 'Search proportion from new users')
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment