Created
November 15, 2020 22:05
-
-
Save k5cents/3ec31907f563702317e63a102cb375da to your computer and use it in GitHub Desktop.
Calculate proportion of population represented by Senate majority party over time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) | |
library(jsonlite) | |
library(janitor) | |
library(rvest) | |
# state pop history -------------------------------------------------------
# Scrape the Wikipedia tables up front. Table 5 is only needed as a fallback
# for the 1900+ series, but tables 1 and 3 (`t[[1]]`, `t[[3]]`) are read
# further down regardless of which branch runs here, so `t` must always exist.
w <- read_html("https://w.wiki/mi3")
t <- html_table(html_nodes(w, ".wikitable"))
# look for zip export from stl fed
# zip has 2016-2019, not on wikipedia
fed_zip <- "~/Documents/statepop_txt.zip"
if (!file.exists(fed_zip)) {
  # no zip: fall back to the scraped table (1900-2015)
  fed <- as_tibble(t[[5]]) %>%
    rename(year = 1) %>%
    type_convert(
      na = "n/a",
      col_types = cols(
        .default = col_number(),
        year = col_integer()
      )
    ) %>%
    # one row per state-year instead of one column per state
    pivot_longer(
      cols = !year,
      names_to = "state",
      values_to = "population"
    ) %>%
    filter(!is.na(population)) %>%
    relocate(state, .before = year)
} else {
  # if zip found, extract and read 1900-2019
  fed_file <- unzip(
    fed_zip,
    files = "statepop_Annual.txt",
    exdir = tempdir()
  )
  fed <- read_tsv(
    file = fed_file,
    col_types = cols(
      .default = col_double(),
      DATE = col_date()
    )
  )
  fed <- fed %>%
    # replace the DATE column with its calendar year
    mutate(year = year(DATE), .before = 1, .keep = "unused") %>%
    # columns are named like "<state>POP"; keep only the prefix as the state
    pivot_longer(
      cols = !year,
      names_to = "state",
      values_to = "population",
      names_pattern = "^(.*)POP$",
      values_drop_na = TRUE
    ) %>%
    # NOTE(review): presumably the export is in thousands of persons - confirm
    mutate(population = population * 1000) %>%
    filter(state %in% state.abb)
}
# sanity checks (trailing comments are the author's recorded results)
setdiff(state.abb, fed$state) # PR
n_distinct(fed$state) # 51
min(fed$year) # 1900
max(fed$year) # 2019
# Reshape one wide census table (first column holds the state name, every
# other column is a census year) into long rows of state / year / population.
#
# tbl: a data frame as returned by `html_table()`.
# Returns a tibble with `state` as a two-letter abbreviation, integer `year`
# and numeric `population`. Rows whose name is not in `state.name` are
# dropped.
tidy_census_table <- function(tbl) {
  tbl %>%
    as_tibble() %>%
    rename(state = 1) %>%
    type_convert(
      na = "",
      col_types = cols(
        .default = col_number(),
        state = col_character()
      )
    ) %>%
    pivot_longer(
      cols = !state,
      names_to = "year",
      names_transform = list(year = as.integer),
      values_to = "population"
    ) %>%
    # strip a trailing bracketed suffix (presumably footnote markers)
    mutate(state = str_remove(state, "\\s\\[.*\\]")) %>%
    filter(state %in% state.name) %>%
    # base-R lookup; every remaining name is guaranteed to be in `state.name`
    mutate(state = state.abb[match(state, state.name)])
}
# format 1790-1860 census
# the second column of this table is not a census year and is dropped
a <- tidy_census_table(select(as_tibble(t[[1]]), -2))
min(a$year) # 1790
max(a$year) # 1860
# format 1870-1890 census
# keep only years before the `fed` series starts so the sources do not overlap
b <- tidy_census_table(t[[3]]) %>%
  filter(year < min(fed$year))
min(b$year) # 1870
max(b$year) # 1890
# combine all years
pop <- bind_rows(fed, a, b) %>%
  arrange(year, state) %>%
  filter(!is.na(population))
# impute the years ========================================================
# candidate rows to fill: every state crossed with every year in 1789-1900
# that has no observation at all in `pop`
miss_years <- expand_grid(
  year = 1789:1900,
  state = unique(pop$state)
) %>%
  filter(!(year %in% pop$year)) %>%
  arrange(state)
# first year each state has a population figure (used below as the year the
# state's series begins)
min_years <- pop %>%
  group_by(state) %>%
  summarise(min_year = min(year), .groups = "drop") %>%
  arrange(state) %>%
  # states first observed in the 1790 census also get a 1789 row
  mutate(min_year = replace(min_year, min_year == 1790, 1789))
# remove years before joined
miss_years <- miss_years %>%
  left_join(min_years, by = "state") %>%
  filter(year >= min_year) %>%
  select(-min_year)
# Linearly interpolate the missing entries of a numeric vector.
#
# x: numeric vector with NA where a value is unknown. Positions 1..length(x)
#    are treated as equally spaced.
# Returns a numeric vector the same length as `x`. Interior NAs are filled
# by linear interpolation between their neighbours; leading and trailing NAs
# stay NA because nothing is extrapolated. With fewer than two known values
# there is nothing to interpolate between, so `x` is returned unchanged
# rather than raising an error.
approx_pop <- function(x) {
  if (sum(!is.na(x)) < 2L) {
    return(as.numeric(x))
  }
  idx <- seq_along(x)
  stats::approx(idx, x, xout = idx, method = "linear")$y
}
# fill the gaps: append the empty state-years, then interpolate each state's
# population series in year order
pop <- bind_rows(pop, miss_years) %>%
  arrange(year) %>%
  group_by(state) %>%
  mutate(population = approx_pop(population)) %>%
  ungroup() %>%
  arrange(state)
# 1789 rows have no earlier value to interpolate from, so copy the 1790
# figures into them (matched by position within the state-sorted table)
pop$population <- replace(
  pop$population,
  pop$year == 1789,
  pop$population[pop$year == 1790]
)
# quick visual check: one line per state, legend hidden
ggplot(pop, aes(year, population)) +
  geom_line(aes(color = state)) +
  theme(legend.position = "none")
# drop intermediates and save the completed series
rm(a, b, fed, miss_years, min_years)
write_tsv(pop, file = "~/Documents/state_pop.tsv")
# sen control hist --------------------------------------------------------
# get a list of legislators
usio_cl <- "https://theunitedstates.io/congress-legislators/"
# Download one legislators JSON file and return one row per Senate term.
#
# json_url: URL of a legislators JSON file under `usio_cl`.
# Returns a tibble with the legislator's wikidata id (`wid`), last name
# (`last`), a per-file legislator index (`term_id`) and the term's `start`,
# `end`, `state`, `class` and `party`. Non-Senate terms are dropped.
read_senate_terms <- function(json_url) {
  raw <- fromJSON(txt = json_url, simplifyDataFrame = TRUE)
  # one row per legislator; `term_id` is the legislator's position in the file
  who <- tibble(
    wid = raw$id$wikidata,
    last = raw$name$last,
    term_id = seq_along(raw$terms)
  )
  # `raw$terms` holds one table of terms per legislator; stack them, tagging
  # each row with the position of the legislator it came from
  terms <- raw$terms %>%
    map_df(as_tibble, .id = "term_id") %>%
    select(term_id, type, start, end, state, class, party) %>%
    mutate(across(term_id, as.integer)) %>%
    mutate(across(c(start, end), parse_date))
  # repeat legislators for every term, keeping only senators
  who %>%
    left_join(terms, by = "term_id") %>%
    filter(type == "sen") %>%
    select(-type)
}
# historical legislators plus the currently serving ones
# NOTE(review): the original comment called the second file the "116
# congress"; it is whatever `legislators-current.json` holds when this runs
leg <- bind_rows(
  read_senate_terms(paste0(usio_cl, "legislators-historical.json")),
  read_senate_terms(paste0(usio_cl, "legislators-current.json"))
)
write_tsv(leg, "~/Documents/sen_member.tsv")
# find congresses ---------------------------------------------------------
# scrape every session table, give the 2nd and 5th columns fixed names so the
# tables stack, then bind them into one tibble
sesh <- html_table(
  html_nodes(read_html("https://w.wiki/miB"), ".wikitable")
) %>%
  map_df(
    function(tbl) {
      as_tibble(rename(tbl, `Congress began` = 2, `Congress ended` = 5))
    }
  ) %>%
  clean_names() %>%
  # "began – ended" in one cell becomes two columns
  separate(
    col = session_dates,
    into = c("session_began", "session_ended"),
    sep = "\\s–\\s"
  ) %>%
  type_convert(
    col_types = cols(
      congress = col_number(),
      congress_began = col_date("%B %d, %Y"),
      session = col_number(),
      session_began = col_date("%B %d, %Y"),
      session_ended = col_date("%B %d, %Y"),
      congress_ended = col_date("%B %d, %Y")
    )
  )
write_tsv(sesh, "~/Documents/con_sessions.tsv")
# collapse each began/ended pair into a single lubridate interval column
sesh <- transmute(
  sesh,
  congress,
  congress_date = congress_began %--% congress_ended,
  session,
  session_date = session_began %--% session_ended
)
# one row per congress with its interval
cons <- distinct(select(sesh, congress, interval = congress_date))
# join data ---------------------------------------------------------------
# add the congress each term started *within*
# The stored value is the position of the matching row in `cons`, which
# equals the congress number only if `cons` is ordered 1..N - TODO confirm.
# When a start date falls in more than one interval the latest row wins; a
# start date that falls in none gets NA instead of the -Inf (plus warning)
# that max() returns on an empty match.
leg$congress <- vapply(
  seq_len(nrow(leg)),
  function(i) {
    hits <- which(leg$start[i] %within% cons$interval)
    if (length(hits) == 0L) NA_integer_ else max(hits)
  },
  integer(1)
)
# calculate party control by congress
# NOTE(review): a term is only counted in the congress where it *starts*, so
# each congress's tally reflects newly started terms rather than every
# sitting senator - confirm this is the intended definition of "majority".
con_party <- leg %>%
  group_by(congress) %>%
  count(party, sort = TRUE) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(congress) %>%
  # ties flag every tied party as the majority
  mutate(majority = n == max(n), .before = n) %>%
  ungroup()
# add majority party flag to terms
leg <- left_join(
  leg,
  select(con_party, congress, party, majority),
  by = c("congress", "party")
)
# share of population living in states with at least one majority-party
# senator, by the year in which terms started
leg %>%
  mutate(year = year(start)) %>%
  # one row per state-year, so a state whose terms started under two
  # congresses in the same calendar year is not weighted twice below
  group_by(year, state) %>%
  summarise(in_maj = any(majority), .groups = "drop") %>%
  left_join(pop, by = c("year", "state")) %>%
  # state-years without a population figure would make every share in that
  # year NA, so drop them before normalising
  filter(!is.na(population)) %>%
  group_by(year) %>%
  mutate(prop_pop = population / sum(population)) %>%
  group_by(year, majority = in_maj) %>%
  summarise(prop_pop = sum(prop_pop), .groups = "drop")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment