Last active
April 29, 2023 21:35
-
-
Save benmarwick/e4fa3fdcbf328d6ef6e38f219e9f8a2a to your computer and use it in GitHub Desktop.
Plot time series of emails resulting from a search, in this case my emails about ARCHY 109 during the Fall quarter of 2018
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get the dates of all emails matching a search string, and plot them.
# install.packages(c("gmailr", "lubridate", "tidyverse"))
suppressPackageStartupMessages(library(gmailr))
suppressPackageStartupMessages(library(tidyverse))

# Search the mailbox of the authenticated user ("me") for the course name,
# asking for at most 1000 hits. The result is a list of result pages; each
# page carries its hits in a `messages` element.
# NOTE(review): newer gmailr releases are understood to prefix these
# functions with `gm_` -- confirm the installed version still exports
# `messages()` before running.
archy_109 <-
  messages(
    search = "ARCHY 109",
    num_results = 1000,
    user_id = "me"
  )

# Number of result pages (10 in the original author's run, with 100
# messages on the first page).
l1 <- length(archy_109)

# Number of messages on *each* page. Pages are not assumed to be the same
# size: a short final page would otherwise produce (i, j) pairs that point
# past the end of that page's `messages` list.
page_sizes <- vapply(
  archy_109,
  function(page) length(page$messages),
  integer(1)
)

# Set up a data frame of (page, position) pairs to access items in the
# nested list: one row per message that actually exists. An empty search
# result yields a zero-row data frame instead of an error.
df <- tibble(
  i = rep(seq_len(l1), times = page_sizes),
  j = sequence(page_sizes)
)
# Extract email dates and email body text.
# Each message is downloaded exactly once and its date is read from the
# downloaded object, instead of requesting every message a second time just
# for the date. The original author notes this step takes several minutes.
df1 <-
  df %>%
  # full message object for the id stored at page i, position j
  mutate(email_text = map2(i, j,
                           ~ gmailr::message(archy_109[[.x]]$messages[[.y]]$id))) %>%
  # date of each message, kept as a list column as before
  mutate(email_date = map(email_text, ~ gmailr::date(.x))) %>%
  # flatten to a character vector; a message without exactly one date value
  # becomes NA rather than silently shortening the vector
  mutate(email_date_out = map_chr(email_date,
                                  ~ if (length(.x) == 1L) {
                                    as.character(.x)
                                  } else {
                                    NA_character_
                                  })) %>%
  # keep the column order the rest of the script was written against
  select(i, j, email_date, email_text, email_date_out)
# Extract sizeEstimate: "Estimated size in bytes of the message."
# https://developers.google.com/gmail/api/v1/reference/users/messages
# This is very fast because it only reads a field of the message objects
# already held in `email_text`.
df2 <-
  df1 %>%
  # running row index, kept for reference; seq_len() stays empty for a
  # zero-row data frame where 1:nrow(.) would yield c(1, 0)
  mutate(idx = seq_len(nrow(.))) %>%
  # map straight over the list column rather than indexing it by row number
  mutate(size_estimate = map_int(email_text, ~ .x$sizeEstimate))
library(lubridate)

# Compute how many emails arrived per calendar day, and their mean size.
archy_109_dates_data <-
  df2 %>%
  # split the date string on spaces into its six expected pieces
  separate(email_date_out,
           into = c("dayname", "day", "month", "year", "time", "offset"),
           sep = " ") %>%
  # rebuild "day month year time"; the UTC offset is deliberately left out,
  # so the timestamp is the clock time written in the header
  unite(day_hour,
        c(day, month, year, time),
        sep = " ", remove = FALSE) %>%
  mutate(day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
         hour = hour(day_hour),
         # calendar day of the email *including its year*, so the same
         # month/day from different years is never merged and the date
         # filter below compares real dates
         month_day_hour = floor_date(day_hour, unit = "day")) %>%
  group_by(month_day_hour, month, day) %>%
  summarise(n = n(),
            mean_size_bytes = mean(size_estimate, na.rm = TRUE)) %>%
  # drop the leftover grouping so later verbs work on the whole table
  ungroup() %>%
  arrange(month_day_hour) %>%
  # just this quarter
  filter(month_day_hour > as_datetime("2018-08-20") &
           month_day_hour < as_datetime("2018-11-14"))
# Plot: number of emails per day as a column chart, with two dated
# reference lines (start of the quarter, short essay deadline) and a text
# label placed next to each line.
ggplot(
  archy_109_dates_data,
  aes(x = month_day_hour, y = n)
) +
  geom_col() +
  # green marker: first day of the quarter
  geom_vline(
    aes(xintercept = as.POSIXct(as.Date("2018-09-26"))),
    colour = "green",
    size = 2
  ) +
  annotate(
    "text",
    label = "Start of the quarter",
    x = as.POSIXct(as.Date("2018-10-03")),
    y = 45,
    size = 7
  ) +
  # red marker: short essay deadline
  geom_vline(
    aes(xintercept = as.POSIXct(as.Date("2018-11-08"))),
    colour = "red",
    size = 2
  ) +
  annotate(
    "text",
    label = "Short essay due",
    x = as.POSIXct(as.Date("2018-11-01")),
    y = 45,
    size = 7
  ) +
  # a tick every four days, labelled like "26 Sep"
  scale_x_datetime(
    name = "Date",
    date_breaks = "4 days",
    date_labels = "%d %b"
  ) +
  scale_y_continuous(
    name = "Number of ARCHY 109 emails per day",
    limits = c(0, 50)
  ) +
  theme_minimal(base_size = 16)
# Plot: email size.
# Build a table of the emails I sent out, with a parsed timestamp and the
# number of whole days between each email and the short essay due date.
# NOTE(review): the sender address in the filter below appears redacted in
# this copy of the script -- restore the real address before running.
email_sizes <-
  df2 %>%
  # sender of every message
  mutate(from = map_chr(email_text, gmailr::from)) %>%
  filter(from == "Ben Marwick <[email protected]>") %>%
  # split the date string on spaces, then rebuild "day month year time"
  separate(
    email_date_out,
    into = c("dayname", "day", "month", "year", "time", "offset"),
    sep = " "
  ) %>%
  unite(day_hour, c(day, month, year, time), sep = " ", remove = FALSE) %>%
  mutate(
    day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
    hour = hour(day_hour),
    # days before the short essay due date (day component of the period
    # between the due date and the email timestamp)
    days_before_due_date =
      day(as.period(as_datetime("2018-11-08") - day_hour))
  )
# Quick check of how the sent-email sizes are distributed; the x axis is on
# a log10 scale.
email_sizes %>%
  ggplot(aes(x = size_estimate)) +
  geom_histogram() +
  scale_x_log10()
# Trim off the outliers: keep only emails smaller than 15,000 bytes that
# fall fewer than 100 days before the due date.
email_sizes_no_outlier <-
  email_sizes %>%
  filter(
    size_estimate < 1.5e4,
    days_before_due_date < 100
  )
# Scatter plot of email size against days before the due date, with a
# linear fit and the fitted equation, R-squared and p-value printed on the
# panel (the annotation stats come from ggpmisc).
library(ggpmisc)

# model shared by the smoother and both annotation layers
my.formula <- y ~ x

ggplot(
  email_sizes_no_outlier,
  aes(x = days_before_due_date, y = size_estimate)
) +
  # jittered, semi-transparent points so overlapping emails stay visible
  geom_jitter(alpha = 0.4, size = 5) +
  geom_smooth(formula = my.formula, method = "lm", color = "black") +
  # equation and R-squared, parsed as a plotmath expression
  stat_poly_eq(
    mapping = aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    formula = my.formula,
    parse = TRUE,
    size = 5,
    label.x.npc = "left",
    label.y.npc = 0.8
  ) +
  # p-value of the same linear model, placed just below the equation
  stat_fit_glance(
    mapping = aes(label = paste0("P-value = ",
                                 signif(..p.value.., digits = 4))),
    method = "lm",
    method.args = list(formula = my.formula),
    geom = "text",
    size = 5,
    label.x.npc = "left",
    label.y.npc = 0.75
  ) +
  # reversed axis: time runs left to right towards the due date at 0
  scale_x_reverse(limits = c(50, 0)) +
  labs(
    x = "Days before the Short Essay due date",
    y = "Email size (bytes)",
    title = "Size of emails I sent relative to time from the Short Essay due date"
  ) +
  theme_minimal(base_size = 16)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.