Created
November 10, 2016 21:50
-
-
Save grosscol/387a3962891952a5f22df9807e2e2221 to your computer and use it in GitHub Desktop.
Selecting a similar distribution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require('dplyr') | |
require('ggplot2') | |
# Simulate a query log: each query gets a random day (1-30) and a
# Poisson-distributed duration. The three rates are cycled across the samples,
# so about two thirds of the queries are "fast" (rate 25 or 35) and one third
# are "slow" (rate 100).
# No seed is set, so the simulated values differ from run to run.
num_samples <- 10000
days <- sample(seq_len(30), num_samples, replace = TRUE)
# rep_len() spells out the recycling that rpois() would otherwise do silently.
qtimes <- rpois(num_samples, rep_len(c(25, 35, 100), num_samples))
qlog <- data.frame(day = days, qtime = qtimes)
# Take a quick look: point area is proportional to the number of queries
# sharing the same (day, qtime) pair. print() makes the plot render when the
# script is run through source() as well as interactively.
print(ggplot(qlog, aes(day, qtime)) + geom_count() + scale_size_area())
# Per-day descriptive statistics of query time: quartiles, median, mean,
# upper-tail quantiles (90th and 99th percentile), maximum, and the number of
# queries logged on that day. The result has one row per day.
summary_of_qtime_day <- qlog %>%
  dplyr::group_by(day) %>%
  dplyr::summarise(
    first_Q = quantile(qtime, .25, na.rm = TRUE),
    med = median(qtime, na.rm = TRUE),
    mean = mean(qtime, na.rm = TRUE),
    third_Q = quantile(qtime, .75, na.rm = TRUE),
    ninty_Q = quantile(qtime, .9, na.rm = TRUE),
    nintynine_Q = quantile(qtime, .99, na.rm = TRUE),
    max = max(qtime, na.rm = TRUE),
    q_per_day = n()
  )
# Print the per-day means and medians one after the other to see how close or
# divergent the two measures of centre are.
print(summary_of_qtime_day$mean)
print(summary_of_qtime_day$med)
# Plot the per-day mean and median query time on shared axes.
# Point size is a constant setting rather than a data mapping, so it is passed
# to geom_point() directly; that way it is not run through a size scale and no
# size legend has to be suppressed. The colour values are named so each series
# keeps its colour regardless of how the legend levels are ordered.
p <- ggplot(summary_of_qtime_day, aes(x = day)) +
  labs(x = 'day', y = 'value') +
  scale_color_manual(values = c(mean = 'red', med = 'blue')) +
  geom_point(aes(y = mean, color = 'mean'), size = 3) +
  geom_point(aes(y = med, color = 'med'), size = 3)
print(p)
# The steps below use base R on the summary table instead of dplyr verbs.
# There is likely a more elegant dplyr approach to the following.
## Select the day whose statistics are closest to the average statistics.
# Every column except the day identifier is a statistic to compare.
column_mask <- colnames(summary_of_qtime_day) != "day"
# Average of each descriptive statistic over all days.
ave_stats <- colMeans(summary_of_qtime_day[column_mask])
# For each day, sum the absolute deviations of its statistics from the
# averages. sweep() subtracts ave_stats column by column, rowSums() totals
# each day's deviations.
# NOTE(review): deviations are summed in raw units, so columns with larger
# magnitudes (e.g. max, q_per_day) contribute more to the total than the
# others; consider standardising the columns first if that is not intended.
residuals <- rowSums(
  abs(sweep(as.matrix(summary_of_qtime_day[column_mask]), 2, ave_stats))
)
summary_of_qtime_day <- cbind(summary_of_qtime_day, res = residuals)
# Choose the day (or days, if tied) with the lowest sum of residuals.
min_res_idx <- which(summary_of_qtime_day$res == min(summary_of_qtime_day$res))
matching_ave_day <- summary_of_qtime_day[min_res_idx, , drop = FALSE]
print("Closest day log to average")
print(matching_ave_day)
#summary(by_day) | |
#day first_Q med mean third_Q | |
#Min. : 1 Min. : 0.00 Min. : 0.00 Min. :111.1 Min. : 34.0 | |
#1st Qu.: 55 1st Qu.:30.00 1st Qu.: 90.00 1st Qu.:221.6 1st Qu.:269.0 | |
#Median :109 Median :31.00 Median : 99.00 Median :241.8 Median :286.0 | |
#Mean :109 Mean :30.44 Mean : 97.96 Mean :253.0 Mean :289.6 | |
#3rd Qu.:163 3rd Qu.:31.00 3rd Qu.:105.00 3rd Qu.:263.1 3rd Qu.:303.0 | |
#Max. :217 Max. :34.00 Max. :140.00 Max. :920.4 Max. :910.2 | |
#NA's :1 NA's :1 NA's :1 NA's :1 NA's :1 | |
#ninty_Q nintynine_Q max q_per_day | |
#Min. : 194.1 Min. : 1166 Min. : 6100 Min. : 8 | |
#1st Qu.: 506.0 1st Qu.: 1518 1st Qu.:14887 1st Qu.:12908 | |
#Median : 552.7 Median : 1712 Median :21474 Median :17272 | |
#Mean : 584.9 Mean : 1934 Mean :23514 Mean :18161 | |
#3rd Qu.: 616.0 3rd Qu.: 1959 3rd Qu.:31507 3rd Qu.:20602 | |
#Max. :2438.0 Max. :13521 Max. :44937 Max. :45972 | |
#NA's :1 NA's :1 NA's :1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment