Skip to content

Instantly share code, notes, and snippets.

@k5cents
Created November 15, 2020 22:05
Show Gist options
  • Save k5cents/3ec31907f563702317e63a102cb375da to your computer and use it in GitHub Desktop.
Save k5cents/3ec31907f563702317e63a102cb375da to your computer and use it in GitHub Desktop.
Calculate proportion of population represented by Senate majority party over time
library(tidyverse)
library(lubridate)
library(jsonlite)
library(janitor)
library(rvest)
# state pop history -------------------------------------------------------
# Build `fed`: one row per state and year holding that year's population.
# The preferred source is a zip export from the St. Louis Fed, which also
# has 2016-2019 (not on Wikipedia).
fed_zip <- "~/Documents/statepop_txt.zip"

# Always scrape the Wikipedia tables. The 1790-1890 census sections further
# down index into `t` whether or not the zip export exists, so `t` must not
# be created only inside the fallback branch (otherwise `t` resolves to
# base::t and the script stops).
w <- read_html("https://w.wiki/mi3")
t <- html_table(html_nodes(w, ".wikitable"))

if (!file.exists(fed_zip)) {
  # no zip export: read 1900-2015 from the fifth Wikipedia table
  fed <- as_tibble(t[[5]]) %>%
    rename(year = 1) %>%
    type_convert(
      na = "n/a",
      col_types = cols(
        .default = col_number(),
        year = col_integer()
      )
    ) %>%
    # one row per state-year instead of one column per state
    pivot_longer(
      cols = !year,
      names_to = "state",
      values_to = "population"
    ) %>%
    filter(!is.na(population)) %>%
    relocate(state, .before = year)
} else {
  # zip found: extract and read 1900-2019
  fed_file <- unzip(fed_zip, files = "statepop_Annual.txt", exdir = tempdir())
  fed <- read_tsv(
    file = fed_file,
    col_types = cols(
      .default = col_double(),
      DATE = col_date()
    )
  )
  fed <- fed %>%
    # replace the DATE column with its calendar year, placed first
    mutate(year = year(DATE), .before = 1, .keep = "unused") %>%
    # column names look like "<state>POP"; keep only the state part
    pivot_longer(
      cols = !year,
      names_to = "state",
      values_to = "population",
      names_pattern = "^(.*)POP$",
      values_drop_na = TRUE
    ) %>%
    # scale by 1000 (presumably the export is in thousands -- confirm)
    mutate(population = population * 1000) %>%
    filter(state %in% state.abb)
}

# interactive sanity checks; trailing comments are the author's noted output
setdiff(state.abb, fed$state) # PR
n_distinct(fed$state) # 51
min(fed$year) # 1900
max(fed$year) # 2019
# format 1790-1860 census
# Reshape the first Wikipedia table into long form (state, year, population)
# using two-letter state abbreviations.
a <- as_tibble(t[[1]]) %>%
  # drop the second column and name the first one `state`
  select(-2) %>%
  rename(state = 1) %>%
  type_convert(
    na = "",
    col_types = cols(
      .default = col_number(),
      state = col_character()
    )
  ) %>%
  # remaining column names are census years
  pivot_longer(
    cols = !state,
    names_to = "year",
    names_transform = list(year = as.integer),
    values_to = "population"
  ) %>%
  # strip a trailing " [..]" suffix (e.g. footnote markers) from the names
  mutate(state = str_remove(state, "\\s\\[.*\\]")) %>%
  filter(state %in% state.name) %>%
  # full name -> abbreviation via the base R state vectors; no package
  # loaded by this script provides abbrev_state()
  mutate(state = state.abb[match(state, state.name)])

# interactive sanity checks; trailing comments are the author's noted output
min(a$year) # 1790
max(a$year) # 1860
# format 1870-1890 census
# Reshape the third Wikipedia table into long form (state, year, population)
# using two-letter state abbreviations, keeping only years before the first
# year already covered by `fed`.
b <- as_tibble(t[[3]]) %>%
  rename(state = 1) %>%
  type_convert(
    na = "",
    col_types = cols(
      .default = col_number(),
      state = col_character()
    )
  ) %>%
  # remaining column names are census years
  pivot_longer(
    cols = !state,
    names_to = "year",
    names_transform = list(year = as.integer),
    values_to = "population"
  ) %>%
  # strip a trailing " [..]" suffix (e.g. footnote markers) from the names
  mutate(state = str_remove(state, "\\s\\[.*\\]")) %>%
  filter(state %in% state.name, year < min(fed$year)) %>%
  # full name -> abbreviation via the base R state vectors; no package
  # loaded by this script provides abbrev_state()
  mutate(state = state.abb[match(state, state.name)])

# interactive sanity checks; trailing comments are the author's noted output
min(b$year) # 1870
max(b$year) # 1890
# combine all years
# Stack the three sources, discard rows without a population value, and
# order the result by year and then state.
pop <- bind_rows(fed, a, b) %>%
  filter(!is.na(population)) %>%
  arrange(year, state)
# impute the years ========================================================
# Build `miss_years`: placeholder (year, state) rows for every year from
# 1789 through 1900 that has no observation in `pop`, limited to years at
# or after each state's first recorded year.

# every state crossed with every year not present anywhere in `pop`
# (base `%in%` negation; no package loaded by this script defines `%out%`)
miss_years <- expand_grid(
  year = 1789:1900,
  state = unique(pop$state)
) %>%
  filter(!(year %in% pop$year)) %>%
  arrange(state)

# first year with data per state, used as the year it joined the union
min_years <- pop %>%
  select(-population) %>%
  group_by(state) %>%
  filter(year == min(year)) %>%
  rename(min_year = year) %>%
  ungroup() %>%
  arrange(state) %>%
  # states first seen in the 1790 census are treated as present from 1789
  mutate(min_year = replace(min_year, min_year == 1790, 1789))

# remove placeholder years from before each state joined
miss_years <- miss_years %>%
  left_join(min_years, by = "state") %>%
  filter(year >= min_year) %>%
  select(-min_year)
# linear approximate missing values
#
# Fills NA entries of `x` by linear interpolation between the neighbouring
# non-NA values, using the element position as the horizontal axis (so the
# elements are assumed to be evenly spaced, e.g. consecutive years).
#
# x: numeric vector, possibly containing NA.
# Returns a numeric vector of the same length as `x`. NAs before the first
# or after the last known value are left as NA (no extrapolation). When
# fewer than two non-NA values exist there is nothing to interpolate
# between, so `x` is returned unchanged instead of raising an error.
approx_pop <- function(x) {
  pos <- seq_along(x)
  if (sum(!is.na(x)) < 2L) {
    return(x)
  }
  stats::approx(pos, x, xout = pos, method = "linear")$y
}
# apply to states
# Append the placeholder rows (their population is NA), then fill each
# state's series, in year order, with approx_pop().
pop <- bind_rows(pop, miss_years) %>%
  arrange(year) %>%
  group_by(state) %>%
  mutate(population = approx_pop(population)) %>%
  ungroup() %>%
  arrange(state)

# Copy the 1790 values onto the 1789 rows. This pairs the rows by position,
# so it relies on both years listing the same states in the same order.
pop$population <- replace(
  pop$population,
  pop$year == 1789,
  pop$population[pop$year == 1790]
)
# visual check: one population line per state, legend hidden
ggplot(pop, aes(x = year, y = population)) +
  geom_line(aes(color = state)) +
  theme(legend.position = "none")

# drop the intermediate objects and save the combined table
rm(a, b, fed, miss_years, min_years)
write_tsv(pop, file = "~/Documents/state_pop.tsv")
# sen control hist --------------------------------------------------------
# Build `leg`: one row per Senate term served, from the historical and the
# current legislator files, then save it.

# Read one congress-legislators JSON file and return its Senate terms.
#
# json_url: URL (or path) of a legislators JSON file.
# Returns a tibble with one row per Senate term: wid, last, term_id, start,
# end, state, class, party. `term_id` is the legislator's position within
# that file, so it is only unique within a single file.
read_senators <- function(json_url) {
  raw <- fromJSON(txt = json_url, simplifyDataFrame = TRUE)
  # table of names, one row per legislator
  who <- tibble(
    wid = raw$id$wikidata,
    last = raw$name$last,
    term_id = seq_along(raw$terms)
  )
  # table of all terms served; `.id` records which legislator each came from
  terms <- raw$terms %>%
    map_df(as_tibble, .id = "term_id") %>%
    select(term_id, type, start, end, state, class, party) %>%
    mutate(across(term_id, as.integer)) %>%
    mutate(across(c(start, end), parse_date))
  # repeat legislators for every term, keeping only senators
  who %>%
    left_join(terms, by = "term_id") %>%
    filter(type == "sen") %>%
    select(-type)
}

# base URL of the legislator files
usio_cl <- "https://theunitedstates.io/congress-legislators/"

# historical legislators followed by the current ones (the 116th congress,
# per the original author's note)
leg <- bind_rows(
  read_senators(paste0(usio_cl, "legislators-historical.json")),
  read_senators(paste0(usio_cl, "legislators-current.json"))
)
write_tsv(leg, "~/Documents/sen_member.tsv")
# find congresses ---------------------------------------------------------
# Scrape every session of congress from the Wikipedia tables into one
# tibble with parsed dates, then save it.
sesh <- "https://w.wiki/miB" %>%
  read_html() %>%
  html_nodes(".wikitable") %>%
  html_table() %>%
  # give columns 2 and 5 the same names in every table before stacking
  map_df(~ as_tibble(rename(.x, `Congress began` = 2, `Congress ended` = 5))) %>%
  clean_names() %>%
  # "began – ended" in one cell becomes two columns
  separate(
    session_dates,
    c("session_began", "session_ended"),
    sep = "\\s–\\s"
  ) %>%
  type_convert(
    col_types = cols(
      congress = col_number(),
      session = col_number(),
      congress_began = col_date("%B %d, %Y"),
      congress_ended = col_date("%B %d, %Y"),
      session_began = col_date("%B %d, %Y"),
      session_ended = col_date("%B %d, %Y")
    )
  )
write_tsv(sesh, "~/Documents/con_sessions.tsv")
# create interval columns
# Replace the began/ended date pairs with lubridate intervals, keeping only
# the congress and session numbers alongside them.
sesh <- transmute(
  sesh,
  congress,
  congress_date = interval(congress_began, congress_ended),
  session,
  session_date = interval(session_began, session_ended)
)

# keep only congresses: one row per distinct (congress, interval) pair
cons <- distinct(select(sesh, congress, interval = congress_date))
# join data ---------------------------------------------------------------
# add the congress each term started *within*
# Intervals are used (rather than exact start dates) so senators who join
# mid-congress are matched too. A start date that falls in more than one
# interval takes the highest congress number. A start date that falls in
# none gets NA; `max(which(...))` on an empty match would give -Inf with a
# warning. The number is read from `cons$congress`, not the row position.
term_con <- vapply(
  seq_len(nrow(leg)),
  function(i) {
    hits <- which(leg$start[i] %within% cons$interval)
    if (length(hits) == 0L) {
      return(NA_real_)
    }
    as.numeric(max(cons$congress[hits]))
  },
  numeric(1)
)
leg <- mutate(leg, congress = term_con)
# calculate party control by congress
# One row per (congress, party): number of terms counted (n), that party's
# share of the congress's terms (prop), and whether it has the largest
# count in that congress (majority; tied parties are all flagged).
con_party <- leg %>%
  count(congress, party) %>%
  group_by(congress) %>%
  mutate(prop = n / sum(n)) %>%
  mutate(majority = n == max(n), .before = n) %>%
  ungroup() %>%
  # largest party first within each congress
  arrange(congress, desc(n))

# add majority party flag to terms
# Columns are picked by name and the join keys are spelled out, so this
# does not depend on the column order of `con_party`.
leg <- left_join(
  leg,
  select(con_party, congress, party, majority),
  by = c("congress", "party")
)
# Share of population by majority status, per term-start year.
# Each state is flagged by whether any of its terms starting that year
# belongs to the majority party; state populations are then expressed as a
# share of the total over the states present in that year and summed per
# flag. The result is printed, not stored.
leg %>%
  mutate(year = year(start)) %>%
  group_by(year, congress, state) %>%
  summarise(in_maj = any(majority), .groups = "drop") %>%
  # explicit keys: match population on year and state only
  left_join(pop, by = c("year", "state")) %>%
  arrange(congress, in_maj) %>%
  group_by(year) %>%
  # a year with any missing population yields NA shares for that year
  mutate(prop_pop = population / sum(population)) %>%
  group_by(year, majority = in_maj) %>%
  summarise(prop_pop = sum(prop_pop), .groups = "drop")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment