Last active
June 4, 2018 22:54
-
-
Save hibernado/8ed985a8a9f0361cbac1ab82f5e008ef to your computer and use it in GitHub Desktop.
Analysis Task
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) | |
library(data.table) | |
getwd() | |
dir = '~/Documents/open_signal/' | |
setwd(dir) | |
file = 'speed_data.csv' | |
df <- read.csv(file,stringsAsFactors = F) | |
glimpse(df) | |
df = df %>% | |
mutate(hour_of_day = hour(hour_read_at), | |
day_of_week = wday(hour_read_at)) | |
df[df$test_type == 'speeds','test_type'] <- 'automatic' | |
df$date = as.Date(df$hour_read_at) | |
glimpse(df) | |
df %>% | |
group_by(network_name_mapped, country,date) %>% | |
summarise(mn = mean(download_speed)) %>% | |
group_by(network_name_mapped, country) %>% | |
summarise(mn_mn = mean(mn), | |
low_mn = quantile(mn,0.05), | |
upp_mn = quantile(mn,0.95)) %T>% | |
print %>% | |
qplot(data=. | |
,x = network_name_mapped | |
,y = mn_mn | |
,ymin = low_mn | |
,ymax = upp_mn | |
,geom = 'errorbar') + | |
geom_point(size=4,colour='blue') + | |
expand_limits(y=0) | |
df %>% | |
group_by(network_name_mapped, country,test_type,date) %>% | |
summarise(mn = mean(download_speed)) %>% | |
group_by(network_name_mapped, country,test_type) %>% | |
summarise(mn_mn = mean(mn), | |
low_mn = quantile(mn,0.05), | |
upp_mn = quantile(mn,0.95)) %T>% | |
print %>% | |
qplot(data=. | |
,x = network_name_mapped | |
,y = mn_mn | |
,ymin = low_mn | |
,ymax = upp_mn | |
,geom = 'errorbar') + geom_point(size=4,colour='blue') + | |
facet_wrap(~test_type) + | |
expand_limits(y=0) | |
df %>% | |
group_by(network_name_mapped, country, test_type) %>% | |
summarise(cnt = n()) %>% | |
spread(test_type,cnt) | |
df %>% | |
group_by(network_name_mapped, country,date) %>% | |
summarise(mn = mean(download_speed), | |
max = max(download_speed), | |
min = min(download_speed), | |
stddv = sqrt(var(download_speed)), | |
med = median(download_speed), | |
cnt = length(download_speed)) %T>% print %>% | |
gather(variable,value,-network_name_mapped,-country,-date) %>% | |
qplot(data=., | |
x = interaction(network_name_mapped,country), | |
y = value, | |
geom = 'boxplot' | |
) + facet_wrap(~ variable,scales = 'free_y') + | |
expand_limits(y=0) + | |
theme(axis.text.x = element_text(hjust=0, angle=-20)) | |
df %>% | |
group_by(network_name_mapped, country,date,test_type) %>% | |
summarise(mn = mean(download_speed), | |
max = max(download_speed), | |
min = min(download_speed), | |
stddv = sqrt(var(download_speed)), | |
med = median(download_speed), | |
cnt = length(download_speed)) %T>% print %>% | |
gather(variable,value,-network_name_mapped,-country,-test_type,-date) %>% | |
qplot(data=., | |
x = interaction(network_name_mapped,country), | |
y = value, | |
group = network_name_mapped, | |
colour = test_type, | |
geom = 'boxplot' | |
) + | |
facet_wrap(test_type ~ variable,scales = 'free_y',nrow=2) + | |
expand_limits(y=0) + | |
theme(axis.text.x = element_text(hjust=0, angle=-20)) | |
# Reasons: | |
- #Geography | |
- #Frequency | |
- # Time of day / day of week (seasonality) | |
# | |
{} | |
############################# | |
# Initial Approach | |
# Can we derive a benchmark for the time of day? | |
summary.1 = | |
df %>% | |
group_by(network_name_mapped, country, test_type, hour_of_day) %>% | |
summarise(stddev = sqrt(var(download_speed))) | |
filter(summary.1, country == 'USA') %>% | |
qplot(data=., | |
x = factor(hour_of_day), | |
y = stddev, | |
colour = test_type) + | |
facet_wrap(~ test_type, scales = 'free_y') + | |
facet_wrap(~ network_name_mapped) | |
glimpse(df) | |
model.1 = lm(download_speed ~ hour_of_day + day_of_week, data = df) | |
summary(model.1) | |
model.2 = lm(download_speed ~ hour_of_day + | |
day_of_week + | |
test_type + | |
network_name_mapped, data = df) | |
summary(model.2) | |
# Are some data sources obviously generating | |
# 'wild' data | |
df %>% | |
group_by(network_name_mapped, country,test_type, device_id_time) %>% | |
summarise(stddev = sqrt(var(log(download_speed)))) %>% | |
qplot(data =., | |
x = stddev) + | |
facet_wrap(test_type ~ network_name_mapped, scales = 'free_y') | |
# Could we just compare the mean for a 90% quantile | |
# 'band' against the overall mean? | |
# is the data sufficiently well behaved? | |
df = | |
df %>% | |
group_by(date, network_name_mapped) %>% | |
mutate( upp = quantile(download_speed, 0.95), | |
low = quantile(download_speed, 0.05)) %>% | |
mutate( in_band = ifelse(download_speed > low & download_speed < upp,'yes','no')) | |
df %>% | |
group_by(in_band) %>% | |
summarise(n()) | |
left_join({ | |
df %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) | |
},{ | |
df %>% | |
filter(in_band == 'yes') %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(in_band_mn = mean(download_speed), in_band_vr = var(download_speed)) | |
} | |
) %>% | |
filter(!is.na(in_band_mn)) %>% | |
mutate( is_anomalous = (abs(in_band_mn-mn)/abs(mn)) > 0.2) %>% | |
qplot(data=., | |
x=date, | |
y=mn, | |
group = factor(1), | |
colour = is_anomalous, | |
geom = 'point') + | |
facet_grid(is_anomalous ~ network_name_mapped) | |
# Answer: NO ! | |
# attempt two | |
df %>% glimpse | |
df %>% | |
group_by(device_id_time, test_type, network_name_mapped) %>% | |
summarise(cnt=n()) %T>% | |
summary %>% | |
qplot(data=., | |
x = log(cnt), | |
fill = network_name_mapped) + | |
facet_wrap(test_type~network_name_mapped, | |
scales = 'free_y', | |
ncol = 4) | |
df %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) %>% | |
gather(variable,value,-network_name_mapped,-date) %>% | |
qplot(data=., | |
x = date, | |
y = value, | |
group = variable, | |
colour = variable, | |
geom='point') + | |
facet_wrap(variable ~ network_name_mapped, | |
scales = 'free_y', | |
ncol = 4) + | |
expand_limits(y=0) | |
# Network 1 | |
# Stable | |
# Network 2 & 4 | |
# cnt, mean & variance go nuts at a certain point in time | |
# Network 3 | |
# cnt shoots up at a certain point in time (promotional activity?) | |
# this clearly has an impact on mean & var but it's not clear that | |
# it is unexplained. | |
# I'm guessing that perhaps new geographies are being monitored | |
# or perhaps certain existing ones more intensively. | |
# Since the change is not immediate it's not obvious | |
# how to treat this gradual change. | |
# What we really want is a measure of 'speed' | |
# which is clearly understood in terms of devices & geography. | |
# | |
# This would then highlight if we have new 'fast' areas. | |
# or existing 'fast' areas which are being reported more frequently. | |
df %>% | |
group_by(date, network_name_mapped,device_id_time) %>% | |
summarise(cnt = n()) %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(mn_cnt = mean(cnt), max_cnt = max(cnt), min_cnt = min(cnt)) %>% | |
gather(variable,value,-network_name_mapped,-date) %>% | |
qplot(data=., | |
x = date, | |
y = value, | |
group = variable, | |
colour = variable, | |
geom='point') + | |
facet_wrap(variable ~ network_name_mapped, | |
scales = 'free_y', | |
ncol = 4) + | |
expand_limits(y=0) | |
library(broom) | |
df.1 = | |
df %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(vr = var(download_speed), mn = mean(download_speed), cnt = n(),man_cnt = sum(ifelse(test_type=='automatic',1,0))) %>% | |
gather(variable,value,-date,-network_name_mapped) %>% | |
ungroup | |
df.1 %>% | |
mutate(date_num = as.numeric(date)) %>% | |
# filter(date == max(date)) | |
group_by(network_name_mapped,variable) %>% | |
mutate(thresh = ifelse(date_num>(max(date_num)-50),1,0)) %>% | |
do( fit = lm(value~date_num+thresh,span=.5,.), data= (.) ) %>% | |
augment(fit,data) %>% | |
select(date,network_name_mapped,variable,value,fitted = .fitted,thresh) %>% | |
qplot(data=., | |
x = date, | |
y = value, | |
group=network_name_mapped, | |
geom='blank') + | |
geom_line(aes(x=date,y=fitted,group=network_name_mapped,colour=thresh)) + | |
facet_wrap(variable ~ network_name_mapped, scales = 'free_y') + | |
expand_limits(y=0) | |
dt = | |
df %>% | |
arrange(date) %>% | |
group_by(date, network_name_mapped) %>% | |
summarise(vr = var(download_speed), | |
mn = mean(download_speed), | |
cnt = n(), | |
man_vr = var(ifelse(test_type=='automatic',download_speed,NA),na.rm =T), | |
man_mn = mean(ifelse(test_type=='automatic',download_speed,NA),na.rm =T), | |
man_cnt = sum(ifelse(test_type=='automatic',1,0)) | |
) %>% | |
data.table | |
dt.2 = dt[network_name_mapped == 'Netwwork2'] | |
# Chose today = 195. | |
# It should be easy to spot problems now | |
(days = dim(dt.2)[1]) | |
day_ = 195 | |
# Split the data: | |
(prev = dt.2[1:day_-1]) | |
(curr = dt.2[day_]) | |
all = rbind(prev,curr) | |
samples = dim(all)[1] | |
all_mn = mean(all$mn) | |
prev_mn = mean(prev$mn) | |
curr_mn = mean(curr$mn) | |
# total_sum_of_squares | |
(sst = sum((all$mn-all_mn)^2)) | |
# residual_sum_of_squares | |
(ssr = sum((prev$mn - prev_mn)^2) + sum((curr$mn - curr_mn)^2)) | |
# model sum of squares | |
(ssm = sst-ssr) | |
(r_squared = ssm/ssr) | |
dgf = 1 # <- deg_free | |
mn_ssm = ssm/dgf | |
mn_ssr = ssr/(samples-dgf-1) | |
(F_ratio = mn_ssm/mn_ssr) | |
# Normal distribution (patently true) | |
# F_test => | |
p = 1-pf(F_ratio, df1=1, df2=samples-dgf,lower.tail = T) | |
p | |
mod.2 = | |
all %>% | |
mutate(is_new_unexplained_mean = ifelse(date==max(all$date),1,0)) %>% | |
lm(mn ~ is_new_unexplained_mean, data=.) | |
mod.2.summary = summary(mod.2) | |
mod.2.summary$fstatistic | |
mod.2.summary | |
# https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression | |
get_f_ratio_p <- function (modelobject) { | |
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ") | |
f <- summary(modelobject)$fstatistic | |
p <- pf(f[1],f[2],f[3],lower.tail=F) | |
attributes(p) <- NULL | |
return(p) | |
} | |
# Set 'today' | |
day_indx = 195 | |
prev_days_to_check = 120 | |
r_squared_threshold = .25 | |
f_ratio_threshold = 0.001 | |
dt.filtered = dt.2[1:day_indx] | |
out_list = list() | |
for (i in 1:day_indx) { | |
dt.filtered[, is_new_unexplained_mean:= 0] | |
dt.filtered[i:day_indx, is_new_unexplained_mean:= 1] | |
print(dt.filtered) | |
mod = lm(mn ~ is_new_unexplained_mean, data=dt.filtered) | |
frp = 0; try({frp = get_f_ratio_p(mod)}); | |
rsqr = summary(mod)$r.squared | |
out_list[[i]] = data.frame(date = dt.filtered[i]$date, | |
f.ratio.p = frp, | |
r.squared = rsqr | |
) | |
} | |
wanted_measures = c('f.ratio.p', | |
'mn', | |
# 'r.squared', | |
'r.squared.loess', | |
'r.squared.slope', | |
'r.squared.accel' | |
) | |
res = | |
bind_rows(out_list) %>% | |
inner_join(dt.filtered) %>% | |
mutate(log.f.ratio.p = log(f.ratio.p)) | |
res$r.squared.loess = predict(loess(r.squared~as.numeric(date),span=.15,data = res)) | |
res$r.squared.slope = c(0,diff(res$r.squared.loess,differences = 1)) | |
res$r.squared.accel = c(0,0,diff(res$r.squared.loess,differences = 2)) | |
anomaly_date = | |
res %>% | |
slice(max(0,day_indx-prev_days_to_check):day_indx) %>% | |
filter(f.ratio.p < f_ratio_threshold) %>% | |
filter(r.squared.accel == max(r.squared.accel)) %>% | |
select(date) | |
res %>% | |
slice(2:n()) %>% | |
gather(variable,value, -date, -network_name_mapped) %>% | |
filter(variable %in% wanted_measures) %>% | |
mutate(post_anomaly = date >= unlist(anomaly_date)) %>% | |
qplot(data=., | |
x = date, | |
y = value, | |
colour = post_anomaly, | |
group = variable) + | |
facet_wrap(~ variable, | |
scales='free_y') + | |
scale_color_brewer(palette = 'Set2') + | |
theme_linedraw() | |
print(anomaly_date) | |
# todo. | |
# choosing where the rate of change in r.squared is a maximum is not the only choice we | |
# might make. | |
# we could choose the first inflection point in r.squared where the slope is positive. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
brainstorm / work in progress.