Last active
October 9, 2019 21:08
-
-
Save khakieconomics/63c143ad7c037d4c39d41364b28c4e68 to your computer and use it in GitHub Desktop.
File to ping Github's API, get the commit numbers for applicants to our predoc program, and run a simple DiD on their daily commits before and after the program.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A simple script to grab GitHub commits for a list of users and plot
# group-average commits per day for two groups.
# To use this script, you'll need to set up a YAML file with credentials
# for GitHub, and set up a Google Sheet with columns `Github handle` and
# `Attended` (with values "Yes" or "No").
# Author: Jim Savage, Schmidt Futures
# Load libraries | |
library(tidyverse); library(httr); library(yaml);library(jsonlite); | |
library(googlesheets); library(RCurl); library(lubridate); library(ggthemes) | |
library(lfe) | |
# Load credentials. `creds.yaml` is read from the working directory; the rest
# of the script reads creds$googlesheets$usernames (the sheet URL) and
# creds$github$username / creds$github$pw from it.
creds <- read_yaml("creds.yaml")
gs_auth()

# Load the google sheet (first worksheet)
the_sheet <- gs_url(x = creds$googlesheets$usernames) %>%
  gs_read(ws = 1)

# Fail early with a clear message if the sheet is missing the two columns the
# rest of the script relies on.
stopifnot(all(c("Github handle", "Attended") %in% names(the_sheet)))

# Filter for program applicants that have a Github profile.
# Handles are trimmed first so that blank / whitespace-only cells are dropped
# along with NAs (they would otherwise produce malformed API URLs), and
# duplicate rows for the same handle are collapsed so each applicant is
# fetched once and the later join on `Github handle` cannot fan out.
only_GH_members <- the_sheet %>%
  mutate(`Github handle` = trimws(as.character(`Github handle`))) %>%
  filter(!is.na(`Github handle`), `Github handle` != "") %>%
  distinct(`Github handle`, .keep_all = TRUE)
# Function to get commits data for a single user.
#
# Looks up the user's profile, lists the repositories behind the profile's
# `repos_url`, and collects the commits in each repository whose GitHub
# committer login matches `user`.
#
# Args:
#   user:      a single GitHub handle (character scalar).
#   max_pages: maximum number of 100-item pages to request per listing
#              (repositories, and commits per repository). Bounds the number
#              of API calls for very large repositories/forks.
#
# Returns:
#   A tibble with one row per commit and columns `author` (committer name as
#   recorded in the commit), `repo`, `Github handle`, `date` (author date,
#   POSIXct in UTC) and `comment` (commit message). A zero-column tibble is
#   returned when nothing could be retrieved.
#
# Credentials are read from the global `creds` list
# (creds$github$username / creds$github$pw).
# NOTE(review): GitHub has retired account-password basic auth for the API;
# `pw` presumably needs to be a personal access token -- confirm.
get_commits <- function(user, max_pages = 10L) {
  base_api <- "https://api.github.com/"
  auth <- authenticate(creds$github$username, creds$github$pw)
  page_size <- 100L

  # Decode a response body explicitly instead of handing the httr response
  # object straight to parse_json().
  parse_body <- function(resp) {
    parse_json(content(resp, as = "text", encoding = "UTF-8"))
  }

  # Fetch a paginated list endpoint and return all items as one flat list.
  # Stops at the first failed request; an empty repository answers HTTP 409,
  # which is expected and therefore not reported.
  fetch_all_pages <- function(url) {
    pages <- list()
    for (page in seq_len(max_pages)) {
      resp <- GET(url, auth, query = list(per_page = page_size, page = page))
      if (http_error(resp)) {
        if (status_code(resp) != 409L) {
          warning("GitHub API request failed (HTTP ", status_code(resp),
                  ") for ", url, call. = FALSE)
        }
        break
      }
      batch <- parse_body(resp)
      if (length(batch) == 0L) break
      pages[[length(pages) + 1L]] <- batch
      # A short page is the last page.
      if (length(batch) < page_size) break
    }
    unlist(pages, recursive = FALSE)
  }

  profile_resp <- GET(paste0(base_api, "users/", user), auth)
  if (http_error(profile_resp)) {
    warning("Could not look up GitHub user '", user, "' (HTTP ",
            status_code(profile_resp), ")", call. = FALSE)
    return(tibble())
  }
  repos <- fetch_all_pages(parse_body(profile_resp)$repos_url)

  commits <- lapply(repos, function(repo) {
    commit_info <- fetch_all_pages(
      paste0(base_api, "repos/", user, "/", repo$name, "/commits")
    )

    # Keep only commits whose committer is this user. The committer entry is
    # NULL when a commit is not linked to a GitHub account, so the test must
    # yield exactly one TRUE/FALSE per commit to stay aligned with the list.
    # Logins are compared case-insensitively because the handle typed into
    # the sheet may differ in case from the login GitHub reports.
    is_own <- vapply(commit_info, function(x) {
      login <- x$committer$login
      !is.null(login) && tolower(login) == tolower(user)
    }, logical(1))

    lapply(commit_info[is_own], function(x) {
      tibble(author = x$commit$committer$name,
             repo = repo$name,
             `Github handle` = user,
             # Timestamps arrive as ISO-8601 UTC strings ("...T...Z").
             date = ymd_hms(x$commit$author$date, tz = "UTC"),
             comment = x$commit$message)
    }) %>% bind_rows()
  }) %>% bind_rows()

  commits
}
# Get commits for all applicants with github accounts.
# Each applicant is fetched independently: a failure for one handle is
# reported as a warning and contributes no rows, instead of aborting the
# whole crawl and discarding everything already downloaded.
get_all_commits <- lapply(only_GH_members$`Github handle`, function(handle) {
  tryCatch(
    get_commits(handle),
    error = function(e) {
      warning("Could not fetch commits for '", handle, "': ",
              conditionMessage(e), call. = FALSE)
      tibble()
    }
  )
})

# Bind them all together and attach the applicant's sheet row.
# The join key is explicit so that a sheet column that happens to share a
# name with a commit column cannot silently become part of the key, and the
# sheet is de-duplicated by handle so no commit is counted twice.
class_commits <- bind_rows(get_all_commits) %>%
  left_join(
    distinct(only_GH_members, `Github handle`, .keep_all = TRUE),
    by = "Github handle"
  )
# Build a balanced applicant-by-day panel of commit counts.
#
# Columns: `Github handle`, `Month` (despite the name, a calendar *day*; the
# name is kept because the rest of the script refers to it), `n` (commits on
# that day, 0 when none), `Attended` ("Yes"/"No", missing treated as "No")
# and `period` ("Pre", "During" or "Post").
#
# The set of applicants comes from the sheet (`only_GH_members`) rather than
# from the handles that appear in `class_commits`, so applicants for whom no
# commits were retrieved stay in the panel as all-zero rows instead of
# silently dropping out of the group averages.
# NOTE(review): this also keeps handles whose lookup failed (e.g. typos) as
# zeros -- confirm that is the intended treatment.
daily_commits <- class_commits %>%
  group_by(`Github handle`, Month = as.Date(date)) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  # Expand to every applicant x day from 2019-01-01 to today; commits dated
  # outside that window have no matching row and are dropped here.
  right_join(
    crossing(
      `Github handle` = unique(only_GH_members$`Github handle`),
      Month = seq(from = as.Date("2019-01-01"), to = Sys.Date(), by = "day")
    ),
    by = c("Github handle", "Month")
  ) %>%
  left_join(
    only_GH_members %>%
      select(`Github handle`, Attended) %>%
      distinct(`Github handle`, .keep_all = TRUE),
    by = "Github handle"
  ) %>%
  mutate(
    Attended = coalesce(as.character(Attended), "No"),
    n = coalesce(n, 0L),
    # Pre: up to 2019-09-19; During: 2019-09-20 to 2019-09-22; Post: after.
    period = case_when(
      Month <= as.Date("2019-09-19") ~ "Pre",
      Month > as.Date("2019-09-22") ~ "Post",
      TRUE ~ "During"
    )
  )
# Line chart of the mean number of commits per applicant on each day,
# one line per `Attended` group. The first day of the panel (2019-01-01) is
# excluded by the strict inequality.
daily_commits %>%
  filter(Month > as.Date("2019-01-01")) %>%
  group_by(Month, Attended) %>%
  summarise(`Average commits` = mean(n)) %>%
  ggplot(
    mapping = aes(x = Month, y = `Average commits`, colour = Attended)
  ) +
  geom_line() +
  labs(
    title = "Github commits per day",
    subtitle = "NYU predoc attendees and non-attendee applicants",
    x = "Date",
    y = "Average daily commits"
  ) +
  theme_hc()
# Pre vs Post comparison of within-person demeaned daily commits.
# Each applicant's commits are centred on that applicant's own mean over the
# Pre and Post days, then averaged by period and `Attended` group; the bars
# show mean +/- 1.96 standard errors computed from the pooled daily values.
daily_commits %>%
  filter(period == "Pre" | period == "Post") %>%
  group_by(`Github handle`) %>%
  mutate(demeaned_commits = n - mean(n)) %>%
  group_by(period, Attended) %>%
  summarise(
    m = mean(demeaned_commits),
    se = sd(demeaned_commits) / sqrt(length(demeaned_commits)),
    n = n()
  ) %>%
  ungroup() %>%
  # Put "Pre" first so the x axis reads Pre -> Post.
  mutate(period = relevel(factor(period), ref = "Pre")) %>%
  ggplot(aes(x = period, y = m, colour = Attended)) +
  geom_linerange(aes(ymin = m - 1.96 * se, ymax = m + 1.96 * se)) +
  geom_point() +
  theme_hc()
# Simple regression analysis: linear regression of daily commit counts on two
# dummies -- attendee during the training and attendee after the training --
# with a fixed effect for each `Github handle`. The trailing `| 0 | 0` parts
# of the felm formula specify no instruments and no clustering.
# NOTE(review): standard errors are therefore not clustered by applicant --
# confirm that is intended for a daily panel.
linear_fit <- felm(
  formula = n ~
    I(period == "During" & Attended == "Yes") +
    I(period == "Post" & Attended == "Yes") |
    `Github handle` | 0 | 0,
  data = daily_commits
)
summary(linear_fit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment