Last active
January 21, 2016 01:08
-
-
Save jeffreyiacono/1bee231519346d82c0bb to your computer and use it in GitHub Desktop.
Plot percentiles by boundaries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup ----
# Use an HTTPS CRAN mirror so packages are not downloaded over plain HTTP.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install only the packages that are missing instead of reinstalling both of
# them on every run of the script.
required_packages <- c("ggplot2", "dplyr")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# library() stops with an error when a package cannot be loaded, whereas
# require() only returns FALSE and lets the script run on to a later failure.
library(ggplot2)
library(dplyr)
# Plot parameters ----
# number of simulated observations, and number of buckets the Xs are split into
points <- 10000
buckets <- 20

# probabilities for the min, quartile and max boundaries of the Ys
quantile_splits <- seq(0, 1, by = 0.25)

# one label per percentile line, each paired with its own colour taken from
# scales::hue_pal()
ntile_labels <- c(
  "0%-tile", "25%-tile", "50%-tile", "75%-tile", "99%-tile", "100%-tile"
)
ntile_colors <- setNames(
  scales::hue_pal()(length(ntile_labels)),
  ntile_labels
)
# Simulate the data ----
# Sequential Xs paired with Ys that follow a downward parabola plus Gaussian
# noise. The noise is multiplied by (1 + x / 1000), so the spread of the Ys
# widens as x grows.
# why? just for fun and it looks cooler!
x <- seq(0, points - 1, by = 1)
jitter <- rnorm(length(x), mean = 0, sd = 500)

# create dataframe + ntiles
df <- data.frame(
  x = x,
  scaled_y = (-1 / 10000) * x^2 + jitter * (1 + x / 1000)
)
# assign every row to one of `buckets` groups based on the rank of its x
df$x_ntile <- ntile(df$x, buckets)
# Per-bucket percentiles ----
# For every x bucket, attach that bucket's percentile boundaries of scaled_y
# to each of its rows (the values are constant within a bucket), plus the
# quartile band each point falls into within its own bucket.
df <- df %>%
  group_by(x_ntile) %>%
  mutate(
    # names = FALSE stops quantile() from attaching "25%"-style names
    y_ntile_0 = quantile(scaled_y, quantile_splits[1], names = FALSE),
    y_ntile_025 = quantile(scaled_y, quantile_splits[2], names = FALSE),
    y_ntile_05 = quantile(scaled_y, quantile_splits[3], names = FALSE),
    y_ntile_075 = quantile(scaled_y, quantile_splits[4], names = FALSE),
    y_ntile_099 = quantile(scaled_y, 0.99, names = FALSE),
    y_ntile_1 = quantile(scaled_y, quantile_splits[5], names = FALSE),
    # explicit levels keep the labels aligned even if a bucket has fewer than
    # four rows and therefore does not produce every quartile
    y_ntile = factor(
      ntile(scaled_y, 4),
      levels = 1:4,
      labels = c("0-25%", "25-50%", "50-75%", "75-100%"),
      ordered = TRUE
    )
  ) %>%
  # drop the grouping so later steps do not silently operate per bucket
  ungroup()
# Plot ----
# Scatter of the simulated points coloured by their within-bucket quartile,
# overlaid with one line per percentile boundary. Each line gets a text label
# placed at x = points, at the height of that percentile in the last row.

# percentile columns, in the same order as ntile_labels / ntile_colors
ntile_columns <- c(
  "y_ntile_0", "y_ntile_025", "y_ntile_05",
  "y_ntile_075", "y_ntile_099", "y_ntile_1"
)
stopifnot(length(ntile_columns) == length(ntile_labels))

# read the label heights from the actual last row instead of assuming the
# data has exactly `points` rows
last_row <- df[nrow(df), ]

# one line + one text label per percentile column, built in a loop rather
# than as six copy-pasted layer pairs; flattened so the layers are added in
# the order line, label, line, label, ...
ntile_layers <- unlist(
  lapply(seq_along(ntile_columns), function(i) {
    column <- ntile_columns[[i]]
    list(
      geom_line(aes(y = .data[[column]]), color = ntile_colors[[i]]),
      annotate(
        geom = "text",
        x = points,
        y = last_row[[column]],
        label = ntile_labels[[i]],
        hjust = 0
      )
    )
  }),
  recursive = FALSE
)

ggplot(data = df, aes(x = x)) +
  geom_point(aes(y = scaled_y, color = y_ntile), alpha = 0.3) +
  ntile_layers +
  scale_color_discrete(
    guide = guide_legend(
      title = "Percentiles",
      # draw the legend keys fully opaque although the points are translucent
      override.aes = list(alpha = 1)
    )
  ) +
  scale_x_continuous(
    breaks = seq(0, points, by = points / buckets),
    # x starts at 0; a lower limit of 1 would drop the first row and the
    # first break
    limits = c(0, points)
  ) +
  labs(x = "Xs", y = "some random Ys")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment