hibernado · June 4, 2018 22:54 · hibernado · Jun 4, 2018
diff --git a/01_analysis.R b/01_analysis.R
 library(tidyverse)
 library(lubridate)
 library(data.table)

 getwd()
 dir = '~/Documents/open_signal/'
 setwd(dir)

 file = 'speed_data.csv'
 df <- read.csv(file,stringsAsFactors = F)


 glimpse(df)
 df = df %>% 
  mutate(hour_of_day = hour(hour_read_at),
         day_of_week = wday(hour_read_at)) 
 df[df$test_type == 'speeds','test_type'] <- 'automatic'
 df$date = as.Date(df$hour_read_at)

 glimpse(df)

 df %>%
  group_by(network_name_mapped, country,date) %>%
  summarise(mn = mean(download_speed)) %>% 
  group_by(network_name_mapped, country) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn,0.05),
            upp_mn = quantile(mn,0.95)) %T>% 
  print %>%
  qplot(data=.
        ,x = network_name_mapped
        ,y = mn_mn
        ,ymin = low_mn
        ,ymax = upp_mn
        ,geom = 'errorbar') + 
  geom_point(size=4,colour='blue') + 
  expand_limits(y=0)


 df %>%
  group_by(network_name_mapped, country,test_type,date) %>%
  summarise(mn = mean(download_speed)) %>% 
  group_by(network_name_mapped, country,test_type) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn,0.05),
            upp_mn = quantile(mn,0.95)) %T>% 
  print %>%
  qplot(data=.
        ,x = network_name_mapped
        ,y = mn_mn
        ,ymin = low_mn
        ,ymax = upp_mn
        ,geom = 'errorbar') + geom_point(size=4,colour='blue') + 
  facet_wrap(~test_type) +
  expand_limits(y=0)


 df %>% 
  group_by(network_name_mapped, country, test_type) %>%
  summarise(cnt = n()) %>%
  spread(test_type,cnt)


 df %>%
  group_by(network_name_mapped, country,date) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sqrt(var(download_speed)),
            med = median(download_speed),
            cnt = length(download_speed)) %T>% print %>%
  gather(variable,value,-network_name_mapped,-country,-date) %>%
  qplot(data=.,
        x = interaction(network_name_mapped,country),
        y = value,
        geom = 'boxplot'
        ) + facet_wrap(~ variable,scales = 'free_y') + 
  expand_limits(y=0) +
  theme(axis.text.x = element_text(hjust=0, angle=-20))
  

 df %>%
  group_by(network_name_mapped, country,date,test_type) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sqrt(var(download_speed)),
            med = median(download_speed),
            cnt = length(download_speed)) %T>% print %>%
  gather(variable,value,-network_name_mapped,-country,-test_type,-date) %>%
  qplot(data=.,
        x = interaction(network_name_mapped,country),
        y = value,
        group = network_name_mapped,
        colour = test_type,
        geom = 'boxplot'
  ) + 
  facet_wrap(test_type ~ variable,scales = 'free_y',nrow=2) + 
  expand_limits(y=0) +
  theme(axis.text.x = element_text(hjust=0, angle=-20))



  
  
 # Reasons:
 - #Geography 
 - #Frequency
 - # Time of day / day of week (seasonality)
 #
 {}

 #############################
 # Initial Approach
 # Can we derive a benchmark for the time of day?
 summary.1 =
  df %>% 
  group_by(network_name_mapped, country, test_type, hour_of_day) %>%
  summarise(stddev = sqrt(var(download_speed)))

 filter(summary.1, country == 'USA') %>%
  qplot(data=.,
        x = factor(hour_of_day),
        y = stddev,
        colour = test_type) +  
  facet_wrap(~ test_type, scales = 'free_y') + 
  facet_wrap(~ network_name_mapped)

 glimpse(df)
 model.1 = lm(download_speed ~ hour_of_day + day_of_week, data = df)
 summary(model.1)

 model.2 = lm(download_speed ~ hour_of_day + 
               day_of_week + 
               test_type + 
               network_name_mapped, data = df)
 summary(model.2)


 # Are some data sources obviously generating 
 # 'wild' data
 df %>% 
  group_by(network_name_mapped, country,test_type, device_id_time) %>%
  summarise(stddev = sqrt(var(log(download_speed)))) %>%
  qplot(data =.,
        x = stddev) + 
  facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')

 # Could we just compare the mean for a 90% quantile 
 # 'band' against the overall mean? 
 # is the data sufficiently well behaved? 
 df = 
  df %>% 
  group_by(date, network_name_mapped) %>%
    mutate( upp = quantile(download_speed, 0.95),
            low = quantile(download_speed, 0.05)) %>%
    mutate( in_band = ifelse(download_speed > low & download_speed < upp,'yes','no'))
  
 df %>% 
  group_by(in_band) %>%
  summarise(n())


 left_join({ 
  df %>% 
  group_by(date, network_name_mapped) %>%  
  summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) 
 },{
  df %>%  
  filter(in_band == 'yes') %>%
  group_by(date, network_name_mapped) %>%  
  summarise(in_band_mn = mean(download_speed), in_band_vr = var(download_speed)) 
 }
 ) %>%
  filter(!is.na(in_band_mn)) %>%
  mutate( is_anomalous = (abs(in_band_mn-mn)/abs(mn)) > 0.2) %>%
  qplot(data=.,
        x=date,
        y=mn,
        group = factor(1),
        colour = is_anomalous,
        geom = 'point') + 
  facet_grid(is_anomalous ~ network_name_mapped) 
  
 # Answer: NO ! 


 # attempt two

 df %>% glimpse

 df %>% 
  group_by(device_id_time, test_type, network_name_mapped) %>%
  summarise(cnt=n()) %T>%
  summary %>%
  qplot(data=.,
        x = log(cnt),
        fill = network_name_mapped) + 
  facet_wrap(test_type~network_name_mapped,
             scales = 'free_y',
             ncol = 4)

 df %>% 
  group_by(date, network_name_mapped) %>%  
  summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) %>%
  gather(variable,value,-network_name_mapped,-date) %>%
  qplot(data=.,
        x = date, 
        y = value,
        group = variable,
        colour = variable, 
        geom='point') + 
  facet_wrap(variable ~ network_name_mapped, 
             scales = 'free_y',
             ncol = 4) + 
    expand_limits(y=0)

 # Network 1 
 # Stable

 # Network 2 & 4
 # cnt, mean & variance go nuts at a certain point in time

 # Network 3
 # cnt shoots up at a certain point in time (promotional activity?)
 # this clearly has an impact on mean & var but it's not clear that 
 # it is unexplained. 
 # I'm guessing that perhaps new geographies are being monitored 
 # or perhaps certain existing ones more intensively.
 # Since the change is not immediate it's not obvious 
 # how to treat this gradual change. 
 # What we really want is a measure of 'speed'
 # which is clearly understood in terms of devices & geography.
 #
 # This would then highlight if we have new 'fast' areas.
 # or existing 'fast' areas which are being reported more frequently.


 df %>% 
  group_by(date, network_name_mapped,device_id_time) %>%  
  summarise(cnt = n()) %>%
  group_by(date, network_name_mapped) %>%  
  summarise(mn_cnt = mean(cnt), max_cnt = max(cnt), min_cnt = min(cnt)) %>%
  gather(variable,value,-network_name_mapped,-date) %>%
  qplot(data=.,
        x = date, 
        y = value,
        group = variable,
        colour = variable, 
        geom='point') + 
  facet_wrap(variable ~ network_name_mapped, 
             scales = 'free_y',
             ncol = 4) + 
  expand_limits(y=0)



 library(broom)

 df.1 = 
  df %>% 
  group_by(date, network_name_mapped) %>%  
  summarise(vr = var(download_speed), mn = mean(download_speed), cnt = n(),man_cnt = sum(ifelse(test_type=='automatic',1,0))) %>%
  gather(variable,value,-date,-network_name_mapped) %>%
  ungroup


 df.1 %>%
  mutate(date_num = as.numeric(date)) %>% 
  # filter(date == max(date))
  group_by(network_name_mapped,variable) %>% 
  mutate(thresh = ifelse(date_num>(max(date_num)-50),1,0)) %>%
  do( fit = lm(value~date_num+thresh,span=.5,.), data= (.) ) %>%
  augment(fit,data) %>% 
  select(date,network_name_mapped,variable,value,fitted = .fitted,thresh) %>%
  qplot(data=.,
        x = date, 
        y = value,
        group=network_name_mapped,
        geom='blank') + 
  geom_line(aes(x=date,y=fitted,group=network_name_mapped,colour=thresh)) +
  facet_wrap(variable ~ network_name_mapped, scales = 'free_y') + 
  expand_limits(y=0)



 dt =
  df %>% 
  arrange(date) %>%
  group_by(date, network_name_mapped) %>%  
  summarise(vr = var(download_speed), 
            mn = mean(download_speed), 
            cnt = n(),
            man_vr = var(ifelse(test_type=='automatic',download_speed,NA),na.rm =T),
            man_mn = mean(ifelse(test_type=='automatic',download_speed,NA),na.rm =T),            
            man_cnt = sum(ifelse(test_type=='automatic',1,0))            
  ) %>%
  data.table


 dt.2 = dt[network_name_mapped == 'Netwwork2']

 # Chose today = 195. 
 # It should be easy to spot problems now
 (days = dim(dt.2)[1])
 day_ = 195

 # Split the data:
 (prev = dt.2[1:day_-1])
 (curr = dt.2[day_])
 all = rbind(prev,curr)

 samples = dim(all)[1]

 all_mn  = mean(all$mn)
 prev_mn = mean(prev$mn)
 curr_mn = mean(curr$mn)


 # total_sum_of_squares
 (sst = sum((all$mn-all_mn)^2))

 # residual_sum_of_squares
 (ssr = sum((prev$mn - prev_mn)^2) + sum((curr$mn - curr_mn)^2))

 # model sum of squares
 (ssm = sst-ssr)

 (r_squared = ssm/ssr)

 dgf = 1 # <- deg_free
 mn_ssm = ssm/dgf 
 mn_ssr = ssr/(samples-dgf-1) 
 (F_ratio = mn_ssm/mn_ssr)
 # Normal distribution (patently true)
 # F_test =>
 p = 1-pf(F_ratio, df1=1, df2=samples-dgf,lower.tail = T) 
 p

 mod.2 =
  all %>%
  mutate(is_new_unexplained_mean = ifelse(date==max(all$date),1,0)) %>%
  lm(mn ~ is_new_unexplained_mean, data=.)
 mod.2.summary = summary(mod.2)
 mod.2.summary$fstatistic
 mod.2.summary


 # https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
 get_f_ratio_p <- function (modelobject) {
  if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1],f[2],f[3],lower.tail=F)
  attributes(p) <- NULL
  return(p)
 }

 # Set 'today'
 day_indx = 195

 prev_days_to_check = 120
 r_squared_threshold = .25
 f_ratio_threshold = 0.001

 dt.filtered = dt.2[1:day_indx]

 out_list = list()
 for (i in 1:day_indx) {
  dt.filtered[, is_new_unexplained_mean:= 0]
  dt.filtered[i:day_indx, is_new_unexplained_mean:= 1]
  print(dt.filtered)
  mod = lm(mn ~ is_new_unexplained_mean, data=dt.filtered)
  
  frp = 0; try({frp = get_f_ratio_p(mod)});
  rsqr = summary(mod)$r.squared
  
  out_list[[i]] = data.frame(date = dt.filtered[i]$date, 
                             f.ratio.p = frp,
                             r.squared = rsqr
                             )
 }


 wanted_measures = c('f.ratio.p',
                    'mn',
                    # 'r.squared',
                    'r.squared.loess',
                    'r.squared.slope',
                    'r.squared.accel'
                    )

 res = 
  bind_rows(out_list) %>% 
  inner_join(dt.filtered) %>%
  mutate(log.f.ratio.p = log(f.ratio.p)) 


 res$r.squared.loess = predict(loess(r.squared~as.numeric(date),span=.15,data = res))
 res$r.squared.slope = c(0,diff(res$r.squared.loess,differences = 1))
 res$r.squared.accel = c(0,0,diff(res$r.squared.loess,differences = 2))

 anomaly_date =
  res %>%
  slice(max(0,day_indx-prev_days_to_check):day_indx) %>%
  filter(f.ratio.p < f_ratio_threshold) %>% 
  filter(r.squared.accel == max(r.squared.accel)) %>%
  select(date) 


 res %>%
  slice(2:n()) %>% 
  gather(variable,value, -date, -network_name_mapped) %>%
  filter(variable %in% wanted_measures) %>%
  mutate(post_anomaly = date >= unlist(anomaly_date)) %>%
  qplot(data=.,
        x = date,
        y = value,
        colour = post_anomaly,
        group = variable) + 
  facet_wrap(~ variable,
             scales='free_y') + 
  scale_color_brewer(palette = 'Set2') + 
  theme_linedraw()

 print(anomaly_date)
 # todo. 
 # choosing where the rate of change in r.squared is a maximum is not the only choice we 
 # might make.
 # we could choose the first inflection point in r.squared where the slope is positive.
	library(tidyverse)
	library(lubridate)
	library(data.table)

	getwd()
	dir = '~/Documents/open_signal/'
	setwd(dir)

	file = 'speed_data.csv'
	df <- read.csv(file,stringsAsFactors = F)


	glimpse(df)
	df = df %>%
	mutate(hour_of_day = hour(hour_read_at),
	day_of_week = wday(hour_read_at))
	df[df$test_type == 'speeds','test_type'] <- 'automatic'
	df$date = as.Date(df$hour_read_at)

	glimpse(df)

	df %>%
	group_by(network_name_mapped, country,date) %>%
	summarise(mn = mean(download_speed)) %>%
	group_by(network_name_mapped, country) %>%
	summarise(mn_mn = mean(mn),
	low_mn = quantile(mn,0.05),
	upp_mn = quantile(mn,0.95)) %T>%
	print %>%
	qplot(data=.
	,x = network_name_mapped
	,y = mn_mn
	,ymin = low_mn
	,ymax = upp_mn
	,geom = 'errorbar') +
	geom_point(size=4,colour='blue') +
	expand_limits(y=0)


	df %>%
	group_by(network_name_mapped, country,test_type,date) %>%
	summarise(mn = mean(download_speed)) %>%
	group_by(network_name_mapped, country,test_type) %>%
	summarise(mn_mn = mean(mn),
	low_mn = quantile(mn,0.05),
	upp_mn = quantile(mn,0.95)) %T>%
	print %>%
	qplot(data=.
	,x = network_name_mapped
	,y = mn_mn
	,ymin = low_mn
	,ymax = upp_mn
	,geom = 'errorbar') + geom_point(size=4,colour='blue') +
	facet_wrap(~test_type) +
	expand_limits(y=0)


	df %>%
	group_by(network_name_mapped, country, test_type) %>%
	summarise(cnt = n()) %>%
	spread(test_type,cnt)


	df %>%
	group_by(network_name_mapped, country,date) %>%
	summarise(mn = mean(download_speed),
	max = max(download_speed),
	min = min(download_speed),
	stddv = sqrt(var(download_speed)),
	med = median(download_speed),
	cnt = length(download_speed)) %T>% print %>%
	gather(variable,value,-network_name_mapped,-country,-date) %>%
	qplot(data=.,
	x = interaction(network_name_mapped,country),
	y = value,
	geom = 'boxplot'
	) + facet_wrap(~ variable,scales = 'free_y') +
	expand_limits(y=0) +
	theme(axis.text.x = element_text(hjust=0, angle=-20))


	df %>%
	group_by(network_name_mapped, country,date,test_type) %>%
	summarise(mn = mean(download_speed),
	max = max(download_speed),
	min = min(download_speed),
	stddv = sqrt(var(download_speed)),
	med = median(download_speed),
	cnt = length(download_speed)) %T>% print %>%
	gather(variable,value,-network_name_mapped,-country,-test_type,-date) %>%
	qplot(data=.,
	x = interaction(network_name_mapped,country),
	y = value,
	group = network_name_mapped,
	colour = test_type,
	geom = 'boxplot'
	) +
	facet_wrap(test_type ~ variable,scales = 'free_y',nrow=2) +
	expand_limits(y=0) +
	theme(axis.text.x = element_text(hjust=0, angle=-20))





	# Reasons:
	- #Geography
	- #Frequency
	- # Time of day / day of week (seasonality)
	#
	{}

	#############################
	# Initial Approach
	# Can we derive a benchmark for the time of day?
	summary.1 =
	df %>%
	group_by(network_name_mapped, country, test_type, hour_of_day) %>%
	summarise(stddev = sqrt(var(download_speed)))

	filter(summary.1, country == 'USA') %>%
	qplot(data=.,
	x = factor(hour_of_day),
	y = stddev,
	colour = test_type) +
	facet_wrap(~ test_type, scales = 'free_y') +
	facet_wrap(~ network_name_mapped)

	glimpse(df)
	model.1 = lm(download_speed ~ hour_of_day + day_of_week, data = df)
	summary(model.1)

	model.2 = lm(download_speed ~ hour_of_day +
	day_of_week +
	test_type +
	network_name_mapped, data = df)
	summary(model.2)


	# Are some data sources obviously generating
	# 'wild' data
	df %>%
	group_by(network_name_mapped, country,test_type, device_id_time) %>%
	summarise(stddev = sqrt(var(log(download_speed)))) %>%
	qplot(data =.,
	x = stddev) +
	facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')

	# Could we just compare the mean for a 90% quantile
	# 'band' against the overall mean?
	# is the data sufficiently well behaved?
	df =
	df %>%
	group_by(date, network_name_mapped) %>%
	mutate( upp = quantile(download_speed, 0.95),
	low = quantile(download_speed, 0.05)) %>%
	mutate( in_band = ifelse(download_speed > low & download_speed < upp,'yes','no'))

	df %>%
	group_by(in_band) %>%
	summarise(n())


	left_join({
	df %>%
	group_by(date, network_name_mapped) %>%
	summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n())
	},{
	df %>%
	filter(in_band == 'yes') %>%
	group_by(date, network_name_mapped) %>%
	summarise(in_band_mn = mean(download_speed), in_band_vr = var(download_speed))
	}
	) %>%
	filter(!is.na(in_band_mn)) %>%
	mutate( is_anomalous = (abs(in_band_mn-mn)/abs(mn)) > 0.2) %>%
	qplot(data=.,
	x=date,
	y=mn,
	group = factor(1),
	colour = is_anomalous,
	geom = 'point') +
	facet_grid(is_anomalous ~ network_name_mapped)

	# Answer: NO !


	# attempt two

	df %>% glimpse

	df %>%
	group_by(device_id_time, test_type, network_name_mapped) %>%
	summarise(cnt=n()) %T>%
	summary %>%
	qplot(data=.,
	x = log(cnt),
	fill = network_name_mapped) +
	facet_wrap(test_type~network_name_mapped,
	scales = 'free_y',
	ncol = 4)

	df %>%
	group_by(date, network_name_mapped) %>%
	summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) %>%
	gather(variable,value,-network_name_mapped,-date) %>%
	qplot(data=.,
	x = date,
	y = value,
	group = variable,
	colour = variable,
	geom='point') +
	facet_wrap(variable ~ network_name_mapped,
	scales = 'free_y',
	ncol = 4) +
	expand_limits(y=0)

	# Network 1
	# Stable

	# Network 2 & 4
	# cnt, mean & variance go nuts at a certain point in time

	# Network 3
	# cnt shoots up at a certain point in time (promotional activity?)
	# this clearly has an impact on mean & var but it's not clear that
	# it is unexplained.
	# I'm guessing that perhaps new geographies are being monitored
	# or perhaps certain existing ones more intensively.
	# Since the change is not immediate it's not obvious
	# how to treat this gradual change.
	# What we really want is a measure of 'speed'
	# which is clearly understood in terms of devices & geography.
	#
	# This would then highlight if we have new 'fast' areas.
	# or existing 'fast' areas which are being reported more frequently.


	df %>%
	group_by(date, network_name_mapped,device_id_time) %>%
	summarise(cnt = n()) %>%
	group_by(date, network_name_mapped) %>%
	summarise(mn_cnt = mean(cnt), max_cnt = max(cnt), min_cnt = min(cnt)) %>%
	gather(variable,value,-network_name_mapped,-date) %>%
	qplot(data=.,
	x = date,
	y = value,
	group = variable,
	colour = variable,
	geom='point') +
	facet_wrap(variable ~ network_name_mapped,
	scales = 'free_y',
	ncol = 4) +
	expand_limits(y=0)



	library(broom)

	df.1 =
	df %>%
	group_by(date, network_name_mapped) %>%
	summarise(vr = var(download_speed), mn = mean(download_speed), cnt = n(),man_cnt = sum(ifelse(test_type=='automatic',1,0))) %>%
	gather(variable,value,-date,-network_name_mapped) %>%
	ungroup


	df.1 %>%
	mutate(date_num = as.numeric(date)) %>%
	# filter(date == max(date))
	group_by(network_name_mapped,variable) %>%
	mutate(thresh = ifelse(date_num>(max(date_num)-50),1,0)) %>%
	do( fit = lm(value~date_num+thresh,span=.5,.), data= (.) ) %>%
	augment(fit,data) %>%
	select(date,network_name_mapped,variable,value,fitted = .fitted,thresh) %>%
	qplot(data=.,
	x = date,
	y = value,
	group=network_name_mapped,
	geom='blank') +
	geom_line(aes(x=date,y=fitted,group=network_name_mapped,colour=thresh)) +
	facet_wrap(variable ~ network_name_mapped, scales = 'free_y') +
	expand_limits(y=0)



	dt =
	df %>%
	arrange(date) %>%
	group_by(date, network_name_mapped) %>%
	summarise(vr = var(download_speed),
	mn = mean(download_speed),
	cnt = n(),
	man_vr = var(ifelse(test_type=='automatic',download_speed,NA),na.rm =T),
	man_mn = mean(ifelse(test_type=='automatic',download_speed,NA),na.rm =T),
	man_cnt = sum(ifelse(test_type=='automatic',1,0))
	) %>%
	data.table


	dt.2 = dt[network_name_mapped == 'Netwwork2']

	# Chose today = 195.
	# It should be easy to spot problems now
	(days = dim(dt.2)[1])
	day_ = 195

	# Split the data:
	(prev = dt.2[1:day_-1])
	(curr = dt.2[day_])
	all = rbind(prev,curr)

	samples = dim(all)[1]

	all_mn = mean(all$mn)
	prev_mn = mean(prev$mn)
	curr_mn = mean(curr$mn)


	# total_sum_of_squares
	(sst = sum((all$mn-all_mn)^2))

	# residual_sum_of_squares
	(ssr = sum((prev$mn - prev_mn)^2) + sum((curr$mn - curr_mn)^2))

	# model sum of squares
	(ssm = sst-ssr)

	(r_squared = ssm/ssr)

	dgf = 1 # <- deg_free
	mn_ssm = ssm/dgf
	mn_ssr = ssr/(samples-dgf-1)
	(F_ratio = mn_ssm/mn_ssr)
	# Normal distribution (patently true)
	# F_test =>
	p = 1-pf(F_ratio, df1=1, df2=samples-dgf,lower.tail = T)
	p

	mod.2 =
	all %>%
	mutate(is_new_unexplained_mean = ifelse(date==max(all$date),1,0)) %>%
	lm(mn ~ is_new_unexplained_mean, data=.)
	mod.2.summary = summary(mod.2)
	mod.2.summary$fstatistic
	mod.2.summary


	# https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
	get_f_ratio_p <- function (modelobject) {
	if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
	f <- summary(modelobject)$fstatistic
	p <- pf(f[1],f[2],f[3],lower.tail=F)
	attributes(p) <- NULL
	return(p)
	}

	# Set 'today'
	day_indx = 195

	prev_days_to_check = 120
	r_squared_threshold = .25
	f_ratio_threshold = 0.001

	dt.filtered = dt.2[1:day_indx]

	out_list = list()
	for (i in 1:day_indx) {
	dt.filtered[, is_new_unexplained_mean:= 0]
	dt.filtered[i:day_indx, is_new_unexplained_mean:= 1]
	print(dt.filtered)
	mod = lm(mn ~ is_new_unexplained_mean, data=dt.filtered)

	frp = 0; try({frp = get_f_ratio_p(mod)});
	rsqr = summary(mod)$r.squared

	out_list[[i]] = data.frame(date = dt.filtered[i]$date,
	f.ratio.p = frp,
	r.squared = rsqr
	)
	}


	wanted_measures = c('f.ratio.p',
	'mn',
	# 'r.squared',
	'r.squared.loess',
	'r.squared.slope',
	'r.squared.accel'
	)

	res =
	bind_rows(out_list) %>%
	inner_join(dt.filtered) %>%
	mutate(log.f.ratio.p = log(f.ratio.p))


	res$r.squared.loess = predict(loess(r.squared~as.numeric(date),span=.15,data = res))
	res$r.squared.slope = c(0,diff(res$r.squared.loess,differences = 1))
	res$r.squared.accel = c(0,0,diff(res$r.squared.loess,differences = 2))

	anomaly_date =
	res %>%
	slice(max(0,day_indx-prev_days_to_check):day_indx) %>%
	filter(f.ratio.p < f_ratio_threshold) %>%
	filter(r.squared.accel == max(r.squared.accel)) %>%
	select(date)


	res %>%
	slice(2:n()) %>%
	gather(variable,value, -date, -network_name_mapped) %>%
	filter(variable %in% wanted_measures) %>%
	mutate(post_anomaly = date >= unlist(anomaly_date)) %>%
	qplot(data=.,
	x = date,
	y = value,
	colour = post_anomaly,
	group = variable) +
	facet_wrap(~ variable,
	scales='free_y') +
	scale_color_brewer(palette = 'Set2') +
	theme_linedraw()

	print(anomaly_date)
	# todo.
	# choosing where the rate of change in r.squared is a maximum is not the only choice we
	# might make.
	# we could choose the first inflection point in r.squared where the slope is positive.