# benmarwick · April 29, 2023 21:35 · benmarwick · Nov 8, 2018
# diff --git a/gistfile1.R b/gistfile1.R
 # get the dates of all emails with a certain search string, and plot

 # install.packages(c("gmailr", "lubridate", "tidyverse"))
 suppressPackageStartupMessages(library(gmailr))
 suppressPackageStartupMessages(library(tidyverse))

 # Search the mailbox. The code below treats the result as a list of result
 # pages, each with a `messages` list whose elements carry an `id`.
 archy_109 <-
   messages(
     search = "ARCHY 109",
     num_results = 1000,
     user_id = "me"
   )

 # number of result pages (10 when this was written)
 l1 <- length(archy_109)
 # number of messages on the first page (100 when this was written)
 l2 <- if (l1 > 0) length(archy_109[[1]]$messages) else 0L

 # Number of messages on each page. A page may hold fewer messages than the
 # first one, so a full i x j grid built from l1 and l2 would index past the
 # end of a shorter page.
 page_sizes <- map_int(archy_109, ~ length(.x$messages))

 # set up dataframe to access items in nested list:
 # one row per message, i = page, j = position within that page
 # (rows are ordered page by page)
 df <- tibble(
   i = rep(seq_len(l1), times = page_sizes),
   j = sequence(page_sizes)
 )

 # extract email dates and email body text
 # this takes several minutes
 # Each message is downloaded once; the date is read from the downloaded copy
 # instead of requesting the same message a second time.
 df1 <-
   df %>%
   mutate(
     email_text = map2(
       i, j,
       ~ gmailr::message(archy_109[[.x]]$messages[[.y]]$id)
     ),
     email_date = map(email_text, gmailr::date),
     # as.character() keeps this a character column even when there are no rows
     email_date_out = as.character(unlist(email_date))
   ) %>%
   # same column order as before
   select(i, j, email_date, email_text, email_date_out)

 # extract sizeEstimate "Estimated size in bytes of the message."
 # https://developers.google.com/gmail/api/v1/reference/users/messages
 # Only reads the message objects already stored in `email_text`, so this is
 # very fast.
 df2 <-
   df1 %>%
   mutate(
     # row_number() is safe on an empty data frame, where 1:nrow(.) would
     # give c(1, 0)
     idx = row_number(),
     # map over the list column directly rather than looking rows up by idx
     size_estimate = map_int(email_text, ~ .x$sizeEstimate)
   )


 library(lubridate)
 # compute how many emails per day
 # The date string is split on spaces into day name, day, month, year, time
 # and offset. The offset is not used, so the time is parsed as written.
 # Emails are counted per calendar date *including the year*: grouping on
 # month and day alone would merge the same day of different years, and a date
 # rebuilt without a year cannot be compared reliably with the 2018 limits.
 archy_109_dates_data <-
   df2 %>%
   separate(email_date_out,
            into = c("dayname", "day", "month", "year", "time", "offset"),
            sep = " ") %>%
   unite(day_hour,
         c(day, month, year, time),
         sep = " ", remove = FALSE) %>%
   mutate(day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
          # midnight of the day the email was sent; x variable of the plot
          month_day_hour = floor_date(day_hour, unit = "day")) %>%
   group_by(month_day_hour, month, day) %>%
   summarise(n = n(),
             mean_size_bytes = mean(size_estimate, na.rm = TRUE)) %>%
   # drop the grouping left over from summarise()
   ungroup() %>%
   arrange(month_day_hour) %>%
   # just this quarter; explicit UTC limits so the result does not depend on
   # the time zone of the R session
   filter(month_day_hour > as.POSIXct("2018-08-20", tz = "UTC") &
            month_day_hour < as.POSIXct("2018-11-14", tz = "UTC"))


 # plot: how many emails per day
 # Bar chart of the daily counts, with vertical lines and text labels for the
 # start of the quarter and the short essay due date.
 ggplot(archy_109_dates_data, aes(x = month_day_hour, y = n)) +
   geom_col() +
   # start of the quarter
   geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-09-26"))),
              colour = "green",
              size = 2) +
   annotate("text",
            label = "Start of the quarter",
            x = as.POSIXct(as.Date("2018-10-03")),
            y = 45,
            size = 7) +
   # short essay due date
   geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-11-08"))),
              colour = "red",
              size = 2) +
   annotate("text",
            label = "Short essay due",
            x = as.POSIXct(as.Date("2018-11-01")),
            y = 45,
            size = 7) +
   scale_x_datetime(name = "Date",
                    date_breaks = "4 days",
                    date_labels = "%d %b") +
   # y axis is fixed at 0-50 emails per day
   scale_y_continuous(name = "Number of ARCHY 109 emails per day",
                      limits = c(0, 50)) +
   theme_minimal(base_size = 16)

 # plot: email size

 # quick look at distribution of email sizes of emails that I send out
 # Keeps only messages whose sender equals the string below, parses their
 # date, and adds the whole number of days between each email and the short
 # essay due date (2018-11-08).
 # NOTE(review): the sender string contains "[email protected]", which looks
 # like a redaction placeholder rather than a real address -- confirm before
 # relying on this filter.
 email_sizes <-
   df2 %>%
   mutate(from = map_chr(email_text, gmailr::from)) %>%
   filter(from == "Ben Marwick <[email protected]>") %>%
   separate(email_date_out,
            sep = " ",
            into = c("dayname", "day", "month", "year", "time", "offset")) %>%
   unite(day_hour, c(day, month, year, time), sep = " ", remove = FALSE) %>%
   mutate(
     day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
     hour = hour(day_hour),
     # days before short essay due date
     days_before_due_date =
       day(as.period(as_datetime("2018-11-08") - day_hour))
   )

 # check distribution of sizes (log10 x axis)
 ggplot(email_sizes, aes(x = size_estimate)) +
   geom_histogram() +
   scale_x_log10()

 # let's trim off the outliers!
 # keep emails under 15,000 bytes sent fewer than 100 days before the due date
 email_sizes_no_outlier <-
   filter(email_sizes,
          size_estimate < 1.5e4,
          days_before_due_date < 100)

 # show formula on plot
 library(ggpmisc)
 my.formula <- y ~ x

 # Jittered scatter of email size against days before the due date, with a
 # linear fit, the fitted equation and R^2, and the p-value of the fit.
 ggplot(email_sizes_no_outlier,
        aes(x = days_before_due_date, y = size_estimate)) +
   geom_jitter(alpha = 0.4, size = 5) +
   geom_smooth(formula = my.formula,
               method = "lm",
               color = "black") +
   # equation and R^2 label
   stat_poly_eq(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                formula = my.formula,
                parse = TRUE,
                label.x.npc = "left",
                label.y.npc = 0.8,
                size = 5) +
   # p-value label, placed just below the equation
   stat_fit_glance(aes(label = paste0("P-value = ",
                                      signif(..p.value.., digits = 4))),
                   method = "lm",
                   method.args = list(formula = my.formula),
                   geom = "text",
                   label.x.npc = "left",
                   label.y.npc = 0.75,
                   size = 5) +
   theme_minimal(base_size = 16) +
   # x axis runs from 50 days before the due date down to 0
   scale_x_reverse(limits = c(50, 0)) +
   labs(x = "Days before the Short Essay due date",
        y = "Email size (bytes)",
        title = "Size of emails I sent relative to time from the Short Essay due date")
	# get the dates of all emails with a certain search string, and plot

	# install.packages(c("gmailr", "lubridate", "tidyverse"))
	suppressPackageStartupMessages(library(gmailr))
	suppressPackageStartupMessages(library(tidyverse))

	# Search the mailbox. The code below treats the result as a list of result
	# pages, each with a `messages` list whose elements carry an `id`.
	archy_109 <-
	  messages(
	    search = "ARCHY 109",
	    num_results = 1000,
	    user_id = "me"
	  )

	# number of result pages (10 when this was written)
	l1 <- length(archy_109)
	# number of messages on the first page (100 when this was written)
	l2 <- if (l1 > 0) length(archy_109[[1]]$messages) else 0L

	# Number of messages on each page. A page may hold fewer messages than the
	# first one, so a full i x j grid built from l1 and l2 would index past the
	# end of a shorter page.
	page_sizes <- map_int(archy_109, ~ length(.x$messages))

	# set up dataframe to access items in nested list:
	# one row per message, i = page, j = position within that page
	# (rows are ordered page by page)
	df <- tibble(
	  i = rep(seq_len(l1), times = page_sizes),
	  j = sequence(page_sizes)
	)

	# extract email dates and email body text
	# this takes several minutes
	# Each message is downloaded once; the date is read from the downloaded copy
	# instead of requesting the same message a second time.
	df1 <-
	  df %>%
	  mutate(
	    email_text = map2(
	      i, j,
	      ~ gmailr::message(archy_109[[.x]]$messages[[.y]]$id)
	    ),
	    email_date = map(email_text, gmailr::date),
	    # as.character() keeps this a character column even when there are no rows
	    email_date_out = as.character(unlist(email_date))
	  ) %>%
	  # same column order as before
	  select(i, j, email_date, email_text, email_date_out)

	# extract sizeEstimate "Estimated size in bytes of the message."
	# https://developers.google.com/gmail/api/v1/reference/users/messages
	# Only reads the message objects already stored in `email_text`, so this is
	# very fast.
	df2 <-
	  df1 %>%
	  mutate(
	    # row_number() is safe on an empty data frame, where 1:nrow(.) would
	    # give c(1, 0)
	    idx = row_number(),
	    # map over the list column directly rather than looking rows up by idx
	    size_estimate = map_int(email_text, ~ .x$sizeEstimate)
	  )


	library(lubridate)
	# compute how many emails per day
	# The date string is split on spaces into day name, day, month, year, time
	# and offset. The offset is not used, so the time is parsed as written.
	# Emails are counted per calendar date *including the year*: grouping on
	# month and day alone would merge the same day of different years, and a date
	# rebuilt without a year cannot be compared reliably with the 2018 limits.
	archy_109_dates_data <-
	  df2 %>%
	  separate(email_date_out,
	           into = c("dayname", "day", "month", "year", "time", "offset"),
	           sep = " ") %>%
	  unite(day_hour,
	        c(day, month, year, time),
	        sep = " ", remove = FALSE) %>%
	  mutate(day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
	         # midnight of the day the email was sent; x variable of the plot
	         month_day_hour = floor_date(day_hour, unit = "day")) %>%
	  group_by(month_day_hour, month, day) %>%
	  summarise(n = n(),
	            mean_size_bytes = mean(size_estimate, na.rm = TRUE)) %>%
	  # drop the grouping left over from summarise()
	  ungroup() %>%
	  arrange(month_day_hour) %>%
	  # just this quarter; explicit UTC limits so the result does not depend on
	  # the time zone of the R session
	  filter(month_day_hour > as.POSIXct("2018-08-20", tz = "UTC") &
	           month_day_hour < as.POSIXct("2018-11-14", tz = "UTC"))


	# plot: how many emails per day
	# Bar chart of the daily counts, with vertical lines and text labels for the
	# start of the quarter and the short essay due date.
	ggplot(archy_109_dates_data, aes(x = month_day_hour, y = n)) +
	  geom_col() +
	  # start of the quarter
	  geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-09-26"))),
	             colour = "green",
	             size = 2) +
	  annotate("text",
	           label = "Start of the quarter",
	           x = as.POSIXct(as.Date("2018-10-03")),
	           y = 45,
	           size = 7) +
	  # short essay due date
	  geom_vline(aes(xintercept = as.POSIXct(as.Date("2018-11-08"))),
	             colour = "red",
	             size = 2) +
	  annotate("text",
	           label = "Short essay due",
	           x = as.POSIXct(as.Date("2018-11-01")),
	           y = 45,
	           size = 7) +
	  scale_x_datetime(name = "Date",
	                   date_breaks = "4 days",
	                   date_labels = "%d %b") +
	  # y axis is fixed at 0-50 emails per day
	  scale_y_continuous(name = "Number of ARCHY 109 emails per day",
	                     limits = c(0, 50)) +
	  theme_minimal(base_size = 16)

	# plot: email size

	# quick look at distribution of email sizes of emails that I send out
	# Keeps only messages whose sender equals the string below, parses their
	# date, and adds the whole number of days between each email and the short
	# essay due date (2018-11-08).
	# NOTE(review): the sender string contains "[email protected]", which looks
	# like a redaction placeholder rather than a real address -- confirm before
	# relying on this filter.
	email_sizes <-
	  df2 %>%
	  mutate(from = map_chr(email_text, gmailr::from)) %>%
	  filter(from == "Ben Marwick <[email protected]>") %>%
	  separate(email_date_out,
	           sep = " ",
	           into = c("dayname", "day", "month", "year", "time", "offset")) %>%
	  unite(day_hour, c(day, month, year, time), sep = " ", remove = FALSE) %>%
	  mutate(
	    day_hour = parse_date_time(day_hour, orders = "%d %m %y %H:%M:%S"),
	    hour = hour(day_hour),
	    # days before short essay due date
	    days_before_due_date =
	      day(as.period(as_datetime("2018-11-08") - day_hour))
	  )

	# check distribution of sizes (log10 x axis)
	ggplot(email_sizes, aes(x = size_estimate)) +
	  geom_histogram() +
	  scale_x_log10()

	# let's trim off the outliers!
	# keep emails under 15,000 bytes sent fewer than 100 days before the due date
	email_sizes_no_outlier <-
	  filter(email_sizes,
	         size_estimate < 1.5e4,
	         days_before_due_date < 100)

	# show formula on plot
	library(ggpmisc)
	my.formula <- y ~ x

	# Jittered scatter of email size against days before the due date, with a
	# linear fit, the fitted equation and R^2, and the p-value of the fit.
	ggplot(email_sizes_no_outlier,
	       aes(x = days_before_due_date, y = size_estimate)) +
	  geom_jitter(alpha = 0.4, size = 5) +
	  geom_smooth(formula = my.formula,
	              method = "lm",
	              color = "black") +
	  # equation and R^2 label
	  stat_poly_eq(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
	               formula = my.formula,
	               parse = TRUE,
	               label.x.npc = "left",
	               label.y.npc = 0.8,
	               size = 5) +
	  # p-value label, placed just below the equation
	  stat_fit_glance(aes(label = paste0("P-value = ",
	                                     signif(..p.value.., digits = 4))),
	                  method = "lm",
	                  method.args = list(formula = my.formula),
	                  geom = "text",
	                  label.x.npc = "left",
	                  label.y.npc = 0.75,
	                  size = 5) +
	  theme_minimal(base_size = 16) +
	  # x axis runs from 50 days before the due date down to 0
	  scale_x_reverse(limits = c(50, 0)) +
	  labs(x = "Days before the Short Essay due date",
	       y = "Email size (bytes)",
	       title = "Size of emails I sent relative to time from the Short Essay due date")