Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active April 29, 2023 21:35
Show Gist options
  • Save benmarwick/e4fa3fdcbf328d6ef6e38f219e9f8a2a to your computer and use it in GitHub Desktop.
Save benmarwick/e4fa3fdcbf328d6ef6e38f219e9f8a2a to your computer and use it in GitHub Desktop.
Plot time series of emails resulting from a search, in this case my emails about ARCHY 109 during the Fall quarter of 2018
# get the dates of all emails with a certain search string, and plot
# install.packages(c("gmailr", "lubridate", "tidyverse"))
suppressPackageStartupMessages(library(gmailr))
suppressPackageStartupMessages(library(tidyverse))
# Search Gmail for messages matching 'ARCHY 109'. The code below treats the
# result as a list of result pages, each keeping its hits in `$messages`.
archy_109 <-
  messages(
    search = 'ARCHY 109',
    num_results = 1000,
    user_id = "me"
  )
l1 <- length(archy_109) # number of result pages (10 in the original run)
l2 <- length((archy_109[[1]]$messages)) # hits on the first page (100 in the original run)
# set up dataframe to access items in nested list: one row per (page i,
# message j) pair that actually exists. Each page is measured separately
# because sizing the grid from the first page alone (l1 x l2) points at
# missing messages whenever a later page is shorter than the first one.
df <-
  bind_rows(
    map(
      seq_along(archy_109),
      ~ tibble(i = .x, j = seq_along(archy_109[[.x]]$messages))
    )
  )
# extract email dates and email body text
# this takes several minutes
df1 <-
  df %>%
  # Download every message exactly once; the date is then read from the
  # stored message instead of requesting the same message a second time.
  mutate(email_text = map2(i, j, ~ gmailr::message(archy_109[[.x]]$messages[[.y]]$id))) %>%
  # gmailr:: prefix is kept because lubridate (attached later in this file)
  # also exports a function called date().
  mutate(email_date = map(email_text, gmailr::date)) %>%
  # One string per row: a message without a usable date becomes NA rather
  # than shortening the vector and breaking mutate().
  mutate(email_date_out = map_chr(
    email_date,
    ~ if (length(.x) == 0) NA_character_ else as.character(.x)[[1]]
  )) %>%
  # keep the column order later steps have always seen
  select(i, j, email_date, email_text, email_date_out)
# extract sizeEstimate "Estimated size in bytes of the message."
# https://developers.google.com/gmail/api/v1/reference/users/messages
# this is very fast! (it only reads the messages already stored in df1)
df2 <-
  df1 %>%
  # row_number() is safe for a zero-row table, where 1:nrow(.) would give
  # c(1, 0); idx is kept so the columns of df2 stay the same as before
  mutate(idx = row_number()) %>%
  # read the field straight from each stored message, no index lookup needed
  mutate(size_estimate = map_int(email_text, ~ .x$sizeEstimate))
library(lubridate)
# compute how many emails per day
# Result: one row per calendar day with the number of emails (n) and their
# mean size in bytes; month_day_hour holds the day as a date-time at midnight.
archy_109_dates_data <-
  df2 %>%
  # split the date header text on spaces; the pieces are named for a string
  # shaped like "<dayname> <day> <month> <year> <time> <offset>"
  separate(email_date_out,
           into = c('dayname', 'day', 'month', 'year', 'time', 'offset'),
           sep = " ") %>%
  # the offset piece is left out, so times are parsed as written (as UTC)
  unite(day_hour,
        c(day, month, year, time),
        sep = " ", remove = FALSE) %>%
  mutate(day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
         hour = hour(day_hour)) %>%
  # Take the calendar day from the full timestamp so the year is retained.
  # Grouping on month + day alone merges the same date from different years,
  # and re-parsing "month day" without a year leaves the year to the
  # parser's default, which the 2018 filter below cannot rely on.
  mutate(month_day_hour = floor_date(day_hour, unit = "day")) %>%
  group_by(month_day_hour, month, day) %>%
  summarise(n = n(),
            mean_size_bytes = mean(size_estimate, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(month, desc(day)) %>%
  # just this quarter
  filter(month_day_hour > as_datetime("2018-08-20") &
           month_day_hour < as_datetime("2018-11-14"))
# plot: how many emails per day
# Columns give the daily count; the green and red vertical lines mark the
# two labelled dates (start of the quarter, short essay deadline).
ggplot(data = archy_109_dates_data,
       mapping = aes(x = month_day_hour, y = n)) +
  geom_col() +
  # start of the quarter
  geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-09-26"))),
             size = 2,
             colour = "green") +
  annotate("text",
           label = "Start of the quarter",
           size = 7,
           x = as.POSIXct(as.Date("2018-10-03")),
           y = 45) +
  # short essay deadline
  geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-11-08"))),
             size = 2,
             colour = "red") +
  annotate("text",
           label = "Short essay due",
           size = 7,
           x = as.POSIXct(as.Date("2018-11-01")),
           y = 45) +
  # axes: a tick every four days, counts capped at 50
  scale_y_continuous(name = "Number of ARCHY 109 emails per day",
                     limits = c(0, 50)) +
  scale_x_datetime(name = "Date",
                   date_breaks = "4 days",
                   date_labels = "%d %b") +
  theme_minimal(base_size = 16)
# plot: email size
# quick look at distribution of email sizes of emails that I send out
# Keeps only messages whose sender string matches the literal below, then
# parses the date text and counts days remaining until 2018-11-08.
# NOTE(review): the address inside the sender string looks like it was
# redacted when this page was scraped - confirm the real value before running.
email_sizes <-
  df2 %>%
  mutate(from = map_chr(email_text, from)) %>%
  filter(from == "Ben Marwick <[email protected]>") %>%
  separate(col = email_date_out,
           into = c("dayname", "day", "month", "year", "time", "offset"),
           sep = " ") %>%
  unite(col = day_hour,
        day, month, year, time,
        sep = " ",
        remove = FALSE) %>%
  mutate(
    day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
    hour = hour(day_hour),
    # days before short essay due date
    days_before_due_date = day(as.period(as_datetime("2018-11-08") - day_hour))
  )
# check distribution of sizes
# histogram of size_estimate drawn on a log10 x axis
ggplot(data = email_sizes,
       mapping = aes(x = size_estimate)) +
  scale_x_log10() +
  geom_histogram()
# let's trim off the outliers!
# keep emails under 1.5e4 bytes that are fewer than 100 days from the due date
email_sizes_no_outlier <-
  filter(email_sizes,
         size_estimate < 1.5e4,
         days_before_due_date < 100)
# show formula on plot
# Scatter of email size against days before the due date, with a linear fit;
# the fitted equation, R squared and p-value are printed on the panel.
library(ggpmisc)
my.formula <- y ~ x
ggplot(data = email_sizes_no_outlier,
       mapping = aes(x = days_before_due_date, y = size_estimate)) +
  geom_jitter(alpha = 0.4,
              size = 5) +
  geom_smooth(formula = my.formula,
              method = "lm",
              color = "black") +
  # equation and R squared label
  stat_poly_eq(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
               formula = my.formula,
               parse = TRUE,
               size = 5,
               label.x.npc = "left",
               label.y.npc = 0.8) +
  # p-value label, placed just below the equation
  stat_fit_glance(aes(label = paste0("P-value = ",
                                     signif(..p.value.., digits = 4))),
                  method = "lm",
                  method.args = list(formula = my.formula),
                  geom = "text",
                  size = 5,
                  label.x.npc = "left",
                  label.y.npc = 0.75) +
  # x axis runs from 50 days out down to the due date itself
  scale_x_reverse(limits = c(50, 0)) +
  theme_minimal(base_size = 16) +
  labs(x = "Days before the Short Essay due date",
       y = "Email size (bytes)",
       title = "Size of emails I sent relative to time from the Short Essay due date")
@benmarwick
Copy link
Author

benmarwick commented Nov 8, 2018

image

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment