k5cents · November 15, 2020 22:05
diff --git a/sen_pop.R b/sen_pop.R
 library(tidyverse)
 library(lubridate)
 library(jsonlite)
 library(janitor)
 library(rvest)

 # state pop history -------------------------------------------------------

 # Scrape the wikipedia population tables up front: the 1790-1890 census
 # tables used further down are needed whether or not the fed zip exists.
 wiki_tables <- read_html("https://w.wiki/mi3") %>%
   html_nodes(".wikitable") %>%
   html_table()

 # look for zip export from stl fed
 # zip has 2016-2019, not on wikipedia
 fed_zip <- "~/Documents/statepop_txt.zip"
 if (!file.exists(fed_zip)) {
   # no zip found: read 1900-2015 from the fifth wikipedia table
   fed <- as_tibble(wiki_tables[[5]]) %>%
     rename(year = 1) %>%
     type_convert(
       na = "n/a",
       col_types = cols(
         .default = col_number(),
         year = col_integer()
       )
     ) %>%
     pivot_longer(
       cols = !year,
       names_to = "state",
       values_to = "population"
     ) %>%
     filter(!is.na(population)) %>%
     relocate(state, .before = year)
 } else {
   # zip found: extract and read 1900-2019
   fed_file <- unzip(
     zipfile = fed_zip,
     files = "statepop_Annual.txt",
     exdir = tempdir()
   )
   fed <- read_tsv(
     file = fed_file,
     col_types = cols(
       .default = col_double(),
       DATE = col_date()
     )
   )
   fed <- fed %>%
     mutate(year = year(DATE), .before = 1, .keep = "unused") %>%
     pivot_longer(
       cols = !year,
       names_to = "state",
       values_to = "population",
       names_pattern = "^(.*)POP$",
       values_drop_na = TRUE
     ) %>%
     # scale by 1000 (the file presumably reports thousands -- confirm)
     mutate(population = population * 1000) %>%
     filter(state %in% state.abb)
 }

 setdiff(state.abb, fed$state) # PR
 n_distinct(fed$state) # 51
 min(fed$year) # 1900
 max(fed$year) # 2019

 # Reshape one wide census table (first column = state name, remaining
 # columns = census years) into long state/year/population rows. Footnote
 # markers such as " [1]" are stripped from the names, rows that are not one
 # of the 50 `state.name` entries are dropped, and names become abbreviations.
 tidy_census <- function(tbl) {
   tbl %>%
     rename(state = 1) %>%
     type_convert(
       na = "",
       col_types = cols(
         .default = col_number(),
         state = col_character()
       )
     ) %>%
     pivot_longer(
       cols = !state,
       names_to = "year",
       names_transform = list(year = as.integer),
       values_to = "population"
     ) %>%
     mutate(state = str_remove(state, "\\s\\[.*\\]")) %>%
     filter(state %in% state.name) %>%
     # base lookup; every remaining name is guaranteed to be in state.name
     mutate(state = state.abb[match(state, state.name)])
 }

 # format 1790-1860 census (the second column is dropped before reshaping)
 a <- as_tibble(wiki_tables[[1]]) %>%
   select(-2) %>%
   tidy_census()

 min(a$year) # 1790
 max(a$year) # 1860

 # format 1870-1890 census, keeping only years not already covered by `fed`
 b <- as_tibble(wiki_tables[[3]]) %>%
   tidy_census() %>%
   filter(year < min(fed$year))

 min(b$year) # 1870
 max(b$year) # 1890

 # combine all years
 pop <- bind_rows(fed, a, b) %>%
   arrange(year, state) %>%
   filter(!is.na(population))

 # impute the years ========================================================

 # candidate rows: every state crossed with every year from 1789 to 1900,
 # keeping only the years that have no observation at all in `pop`
 miss_years <- expand_grid(
   year = 1789:1900,
   state = unique(pop$state)
 ) %>%
   filter(!(year %in% pop$year)) %>%
   arrange(state)

 # first year each state appears in `pop` (used as the year joined union)
 min_years <- pop %>%
   group_by(state) %>%
   summarise(min_year = min(year), .groups = "drop") %>%
   arrange(state)

 # states already present in 1790 are extended back to 1789; only the
 # numeric column is touched
 min_years <- min_years %>%
   mutate(min_year = replace(min_year, min_year == 1790, 1789))

 # remove years before joined
 miss_years <- miss_years %>%
   left_join(min_years, by = "state") %>%
   filter(year >= min_year) %>%
   select(-min_year)

 # linear approximate missing values
 #
 # Fills interior NA values of `x` by linear interpolation between the
 # nearest non-missing neighbours, treating positions as equally spaced.
 # Leading and trailing NA values stay NA. With fewer than two non-missing
 # values there is nothing to interpolate between, so `x` is returned
 # unchanged (as double) instead of letting approxfun() raise an error.
 approx_pop <- function(x) {
   if (sum(!is.na(x)) < 2L) {
     return(as.numeric(x))
   }
   idx <- seq_along(x)
   approxfun(idx, x, method = "linear")(idx)
 }

 # apply to states
 # stack the placeholder rows under the observed ones, order by year, and
 # interpolate each state's series on its own
 pop <- arrange(bind_rows(pop, miss_years), year)
 pop <- pop %>%
   group_by(state) %>%
   mutate(population = approx_pop(population)) %>%
   ungroup()
 pop <- arrange(pop, state)

 # 1789 precedes the first census, so copy the 1790 figure into it
 # (positional copy between the 1789 rows and the 1790 rows)
 pop[["population"]][pop[["year"]] == 1789] <-
   pop[["population"]][pop[["year"]] == 1790]

 # one line per state, legend hidden
 ggplot(data = pop, mapping = aes(x = year, y = population)) +
   geom_line(mapping = aes(color = state)) +
   theme(legend.position = "none")

 # drop intermediates
 rm(list = c("a", "b", "fed", "miss_years", "min_years"))

 write_tsv(x = pop, file = "~/Documents/state_pop.tsv")

 # sen control hist --------------------------------------------------------

 # get a list of legislators
 usio_cl <- "https://theunitedstates.io/congress-legislators/"

 # Download one congress-legislators JSON file and return one row per senate
 # term: wikidata id, last name, legislator index (`term_id`), term start and
 # end dates, state, class and party.
 read_senators <- function(url) {
   raw <- fromJSON(txt = url, simplifyDataFrame = TRUE)

   # table of names; term_id is the legislator's position in the file
   who <- tibble(
     wid = raw$id$wikidata,
     last = raw$name$last,
     term_id = seq_along(raw$terms)
   )

   # table of all terms served, keyed by the same position
   terms <- raw$terms %>%
     map_df(as_tibble, .id = "term_id") %>%
     select(term_id, type, start, end, state, class, party) %>%
     mutate(across(term_id, as.integer)) %>%
     mutate(across(c(start, end), parse_date))

   # repeat legislators for every term, keep only senators
   who %>%
     left_join(terms, by = "term_id") %>%
     filter(type == "sen") %>%
     select(-type)
 }

 # historical senators
 leg <- read_senators(paste0(usio_cl, "legislators-historical.json"))

 # repeat for 116 congress =================================================

 leg_116 <- read_senators(paste0(usio_cl, "legislators-current.json"))

 # add 116 to historical
 leg <- bind_rows(leg, leg_116)

 write_tsv(leg, "~/Documents/sen_member.tsv")

 rm(leg_116)

 # find congresses ---------------------------------------------------------

 # scrape all sessions of congress: every ".wikitable" on the page
 sesh <- html_table(html_nodes(read_html("https://w.wiki/miB"), ".wikitable"))
 # name columns 2 and 5 identically in every table before stacking them
 sesh <- map(sesh, rename, `Congress began` = 2, `Congress ended` = 5)
 sesh <- clean_names(map_df(sesh, as_tibble))
 # split "began – ended" into two columns
 sesh <- separate(
   data = sesh,
   col = session_dates,
   into = c("session_began", "session_ended"),
   sep = "\\s–\\s"
 )
 # parse numbers and "Month day, year" dates
 sesh <- type_convert(
   df = sesh,
   col_types = cols(
     congress = col_number(),
     congress_began = col_date("%B %d, %Y"),
     session = col_number(),
     session_began = col_date("%B %d, %Y"),
     session_ended = col_date("%B %d, %Y"),
     congress_ended = col_date("%B %d, %Y")
   )
 )

 write_tsv(sesh, "~/Documents/con_sessions.tsv")

 # create interval columns and keep only those plus the identifiers
 sesh <- transmute(
   sesh,
   congress,
   congress_date = interval(congress_began, congress_ended),
   session,
   session_date = interval(session_began, session_ended)
 )

 # keep only congresses: one row per distinct congress/interval pair
 cons <- distinct(select(sesh, congress, interval = congress_date))

 # join data ---------------------------------------------------------------

 # add the congress each term started *within*
 # A start date on a boundary day can fall inside two consecutive congresses
 # (senators joining mid congress are handled by the interval test); the
 # later congress is kept. A start date outside every interval yields NA
 # rather than the -Inf that max() of an empty set would produce.
 term_con <- vapply(
   X = seq_len(nrow(leg)),
   FUN = function(i) {
     hit <- which(leg$start[i] %within% cons$interval)
     if (length(hit) == 0L) {
       return(NA_real_)
     }
     # look up the congress number instead of trusting the row position
     max(cons$congress[hit])
   },
   FUN.VALUE = numeric(1)
 )

 leg <- mutate(leg, congress = term_con)

 # calculate party control by congress: term counts per party, each party's
 # share, and a flag for the party (or tied parties) with the most terms
 con_party <- leg %>%
   group_by(congress) %>%
   count(party, sort = TRUE) %>%
   mutate(prop = n / sum(n)) %>%
   arrange(congress) %>%
   mutate(majority = n == max(n), .before = n)

 # add majority party flag to terms
 leg <- left_join(
   leg,
   select(ungroup(con_party), congress, party, majority),
   by = c("congress", "party")
 )

 # share of population living in states with at least one majority-party
 # term starting that year, versus states with none
 leg %>%
   mutate(year = year(start)) %>%
   group_by(year, congress, state) %>%
   summarise(in_maj = any(majority)) %>%
   left_join(pop, by = c("year", "state")) %>%
   arrange(congress, in_maj) %>%
   group_by(year) %>%
   mutate(prop_pop = population / sum(population)) %>%
   group_by(year, majority = in_maj) %>%
   summarise(prop_pop = sum(prop_pop))
	library(tidyverse)
	library(lubridate)
	library(jsonlite)
	library(janitor)
	library(rvest)

	# state pop history -------------------------------------------------------

	# Scrape the wikipedia population tables up front: the 1790-1890 census
	# tables used further down are needed whether or not the fed zip exists.
	wiki_tables <- read_html("https://w.wiki/mi3") %>%
	  html_nodes(".wikitable") %>%
	  html_table()

	# look for zip export from stl fed
	# zip has 2016-2019, not on wikipedia
	fed_zip <- "~/Documents/statepop_txt.zip"
	if (!file.exists(fed_zip)) {
	  # no zip found: read 1900-2015 from the fifth wikipedia table
	  fed <- as_tibble(wiki_tables[[5]]) %>%
	    rename(year = 1) %>%
	    type_convert(
	      na = "n/a",
	      col_types = cols(
	        .default = col_number(),
	        year = col_integer()
	      )
	    ) %>%
	    pivot_longer(
	      cols = !year,
	      names_to = "state",
	      values_to = "population"
	    ) %>%
	    filter(!is.na(population)) %>%
	    relocate(state, .before = year)
	} else {
	  # zip found: extract and read 1900-2019
	  fed_file <- unzip(
	    zipfile = fed_zip,
	    files = "statepop_Annual.txt",
	    exdir = tempdir()
	  )
	  fed <- read_tsv(
	    file = fed_file,
	    col_types = cols(
	      .default = col_double(),
	      DATE = col_date()
	    )
	  )
	  fed <- fed %>%
	    mutate(year = year(DATE), .before = 1, .keep = "unused") %>%
	    pivot_longer(
	      cols = !year,
	      names_to = "state",
	      values_to = "population",
	      names_pattern = "^(.*)POP$",
	      values_drop_na = TRUE
	    ) %>%
	    # scale by 1000 (the file presumably reports thousands -- confirm)
	    mutate(population = population * 1000) %>%
	    filter(state %in% state.abb)
	}

	setdiff(state.abb, fed$state) # PR
	n_distinct(fed$state) # 51
	min(fed$year) # 1900
	max(fed$year) # 2019

	# Reshape one wide census table (first column = state name, remaining
	# columns = census years) into long state/year/population rows. Footnote
	# markers such as " [1]" are stripped from the names, rows that are not one
	# of the 50 `state.name` entries are dropped, and names become abbreviations.
	tidy_census <- function(tbl) {
	  tbl %>%
	    rename(state = 1) %>%
	    type_convert(
	      na = "",
	      col_types = cols(
	        .default = col_number(),
	        state = col_character()
	      )
	    ) %>%
	    pivot_longer(
	      cols = !state,
	      names_to = "year",
	      names_transform = list(year = as.integer),
	      values_to = "population"
	    ) %>%
	    mutate(state = str_remove(state, "\\s\\[.*\\]")) %>%
	    filter(state %in% state.name) %>%
	    # base lookup; every remaining name is guaranteed to be in state.name
	    mutate(state = state.abb[match(state, state.name)])
	}

	# format 1790-1860 census (the second column is dropped before reshaping)
	a <- as_tibble(wiki_tables[[1]]) %>%
	  select(-2) %>%
	  tidy_census()

	min(a$year) # 1790
	max(a$year) # 1860

	# format 1870-1890 census, keeping only years not already covered by `fed`
	b <- as_tibble(wiki_tables[[3]]) %>%
	  tidy_census() %>%
	  filter(year < min(fed$year))

	min(b$year) # 1870
	max(b$year) # 1890

	# combine all years
	pop <- bind_rows(fed, a, b) %>%
	  arrange(year, state) %>%
	  filter(!is.na(population))

	# impute the years ========================================================

	# candidate rows: every state crossed with every year from 1789 to 1900,
	# keeping only the years that have no observation at all in `pop`
	miss_years <- expand_grid(
	  year = 1789:1900,
	  state = unique(pop$state)
	) %>%
	  filter(!(year %in% pop$year)) %>%
	  arrange(state)

	# first year each state appears in `pop` (used as the year joined union)
	min_years <- pop %>%
	  group_by(state) %>%
	  summarise(min_year = min(year), .groups = "drop") %>%
	  arrange(state)

	# states already present in 1790 are extended back to 1789; only the
	# numeric column is touched
	min_years <- min_years %>%
	  mutate(min_year = replace(min_year, min_year == 1790, 1789))

	# remove years before joined
	miss_years <- miss_years %>%
	  left_join(min_years, by = "state") %>%
	  filter(year >= min_year) %>%
	  select(-min_year)

	# linear approximate missing values
	#
	# Fills interior NA values of `x` by linear interpolation between the
	# nearest non-missing neighbours, treating positions as equally spaced.
	# Leading and trailing NA values stay NA. With fewer than two non-missing
	# values there is nothing to interpolate between, so `x` is returned
	# unchanged (as double) instead of letting approxfun() raise an error.
	approx_pop <- function(x) {
	  if (sum(!is.na(x)) < 2L) {
	    return(as.numeric(x))
	  }
	  idx <- seq_along(x)
	  approxfun(idx, x, method = "linear")(idx)
	}

	# apply to states
	# stack the placeholder rows under the observed ones, order by year, and
	# interpolate each state's series on its own
	pop <- arrange(bind_rows(pop, miss_years), year)
	pop <- pop %>%
	  group_by(state) %>%
	  mutate(population = approx_pop(population)) %>%
	  ungroup()
	pop <- arrange(pop, state)

	# 1789 precedes the first census, so copy the 1790 figure into it
	# (positional copy between the 1789 rows and the 1790 rows)
	pop[["population"]][pop[["year"]] == 1789] <-
	  pop[["population"]][pop[["year"]] == 1790]

	# one line per state, legend hidden
	ggplot(data = pop, mapping = aes(x = year, y = population)) +
	  geom_line(mapping = aes(color = state)) +
	  theme(legend.position = "none")

	# drop intermediates
	rm(list = c("a", "b", "fed", "miss_years", "min_years"))

	write_tsv(x = pop, file = "~/Documents/state_pop.tsv")

	# sen control hist --------------------------------------------------------

	# get a list of legislators
	usio_cl <- "https://theunitedstates.io/congress-legislators/"

	# Download one congress-legislators JSON file and return one row per senate
	# term: wikidata id, last name, legislator index (`term_id`), term start and
	# end dates, state, class and party.
	read_senators <- function(url) {
	  raw <- fromJSON(txt = url, simplifyDataFrame = TRUE)

	  # table of names; term_id is the legislator's position in the file
	  who <- tibble(
	    wid = raw$id$wikidata,
	    last = raw$name$last,
	    term_id = seq_along(raw$terms)
	  )

	  # table of all terms served, keyed by the same position
	  terms <- raw$terms %>%
	    map_df(as_tibble, .id = "term_id") %>%
	    select(term_id, type, start, end, state, class, party) %>%
	    mutate(across(term_id, as.integer)) %>%
	    mutate(across(c(start, end), parse_date))

	  # repeat legislators for every term, keep only senators
	  who %>%
	    left_join(terms, by = "term_id") %>%
	    filter(type == "sen") %>%
	    select(-type)
	}

	# historical senators
	leg <- read_senators(paste0(usio_cl, "legislators-historical.json"))

	# repeat for 116 congress =================================================

	leg_116 <- read_senators(paste0(usio_cl, "legislators-current.json"))

	# add 116 to historical
	leg <- bind_rows(leg, leg_116)

	write_tsv(leg, "~/Documents/sen_member.tsv")

	rm(leg_116)

	# find congresses ---------------------------------------------------------

	# scrape all sessions of congress: every ".wikitable" on the page
	sesh <- html_table(html_nodes(read_html("https://w.wiki/miB"), ".wikitable"))
	# name columns 2 and 5 identically in every table before stacking them
	sesh <- map(sesh, rename, `Congress began` = 2, `Congress ended` = 5)
	sesh <- clean_names(map_df(sesh, as_tibble))
	# split "began – ended" into two columns
	sesh <- separate(
	  data = sesh,
	  col = session_dates,
	  into = c("session_began", "session_ended"),
	  sep = "\\s–\\s"
	)
	# parse numbers and "Month day, year" dates
	sesh <- type_convert(
	  df = sesh,
	  col_types = cols(
	    congress = col_number(),
	    congress_began = col_date("%B %d, %Y"),
	    session = col_number(),
	    session_began = col_date("%B %d, %Y"),
	    session_ended = col_date("%B %d, %Y"),
	    congress_ended = col_date("%B %d, %Y")
	  )
	)

	write_tsv(sesh, "~/Documents/con_sessions.tsv")

	# create interval columns and keep only those plus the identifiers
	sesh <- transmute(
	  sesh,
	  congress,
	  congress_date = interval(congress_began, congress_ended),
	  session,
	  session_date = interval(session_began, session_ended)
	)

	# keep only congresses: one row per distinct congress/interval pair
	cons <- distinct(select(sesh, congress, interval = congress_date))

	# join data ---------------------------------------------------------------

	# add the congress each term started within
	# A start date on a boundary day can fall inside two consecutive congresses
	# (senators joining mid congress are handled by the interval test); the
	# later congress is kept. A start date outside every interval yields NA
	# rather than the -Inf that max() of an empty set would produce.
	term_con <- vapply(
	  X = seq_len(nrow(leg)),
	  FUN = function(i) {
	    hit <- which(leg$start[i] %within% cons$interval)
	    if (length(hit) == 0L) {
	      return(NA_real_)
	    }
	    # look up the congress number instead of trusting the row position
	    max(cons$congress[hit])
	  },
	  FUN.VALUE = numeric(1)
	)

	leg <- mutate(leg, congress = term_con)

	# calculate party control by congress: term counts per party, each party's
	# share, and a flag for the party (or tied parties) with the most terms
	con_party <- leg %>%
	  group_by(congress) %>%
	  count(party, sort = TRUE) %>%
	  mutate(prop = n / sum(n)) %>%
	  arrange(congress) %>%
	  mutate(majority = n == max(n), .before = n)

	# add majority party flag to terms
	leg <- left_join(
	  leg,
	  select(ungroup(con_party), congress, party, majority),
	  by = c("congress", "party")
	)

	# share of population living in states with at least one majority-party
	# term starting that year, versus states with none
	leg %>%
	  mutate(year = year(start)) %>%
	  group_by(year, congress, state) %>%
	  summarise(in_maj = any(majority)) %>%
	  left_join(pop, by = c("year", "state")) %>%
	  arrange(congress, in_maj) %>%
	  group_by(year) %>%
	  mutate(prop_pop = population / sum(population)) %>%
	  group_by(year, majority = in_maj) %>%
	  summarise(prop_pop = sum(prop_pop))