Created
March 11, 2020 19:36
-
-
Save hepplerj/a2c62410125c0b334e12eb8b449be50d to your computer and use it in GitHub Desktop.
An example script for census data in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(tidycensus) | |
# My recommendation is to use the tidycensus library to make getting this data | |
# easier than reading in the data from the Census website. | |
# | |
# Before you can begin, you'll need to get an API key from the Census Bureau. | |
# You can acquire one here: https://api.census.gov/data/key_signup.html
# | |
# Once you have the API key, run the following in RStudio: | |
# usethis::edit_r_environ() | |
# | |
# This will open your .Renviron file. Here, you'll add the following line, replacing | |
# YOUR_API_KEY with the key sent to you by the Census Bureau: | |
# CENSUS_API_KEY = YOUR_API_KEY | |
# | |
# Restart R for the changes to take effect. | |
# Finding Census Data ----------------------------------------------------- | |
# First, we'll need to know the variable ID from the Census or ACS -- since there | |
# are thousands of these IDs across different Census files, we'll use the
# load_variables function to find the information. It takes two arguments: | |
# 1. the year, and 2. the dataset. | |
# Fetch the variable catalogue for the 2018 "acs5" dataset so we can look up
# the IDs we need.
variables <- load_variables(year = 2018, dataset = "acs5")
# We'll now open up the data frame and look for the button in the upper left
# called 'Filter.' From here, start typing "attainment" to find the set of
# variables related to that data. Here we can see the list of IDs that are
# associated with that dataset.
# View() is called directly rather than at the end of a pipe: when piped, the
# viewer tab is titled with the pipe placeholder "." instead of "variables".
View(variables)
# We'll use that list of IDs to build a data frame. The IDs we want run
# sequentially from B15001_001 through B15001_083, so rather than typing all 83
# by hand we generate them: "%03d" zero-pads each number to three digits.
attainment <- get_acs(
  geography = "us",
  variables = sprintf("B15001_%03d", seq_len(83)),
  year = 2018
)
# With the data frame in hand we can chart it. For example, one bar per
# variable ID, with the bar height taken directly from the estimate column:
ggplot(data = attainment, mapping = aes(x = variable, y = estimate)) +
  geom_col()
# Using your export ------------------------------------------------------ | |
# Just as a note: the data set you sent me would take quite a bit of work to get | |
# it into a tidy data format. As a starting point, you could do something like | |
# the code below (but I can't do the work for you): | |
# First, we read in our data and rename its columns in one step. The skip
# argument tells read_csv to ignore the first three rows of the spreadsheet;
# setNames() then replaces the column names with the ones listed below.
data <- setNames(
  read_csv("export (3).csv", skip = 3),
  c(
    "age", "type", "demographic", "completed",
    "total_estimate", "total_moe", "percent_estimate", "percent_moe",
    "male_estimate", "male_estimate_moe", "male_percent", "male_percent_moe",
    "female_estimate", "female_estimate_moe", "female_percent",
    "female_percent_moe"
  )
)
# Finally, we use pivot_longer() to get our data into a tidy format. (It is the
# current replacement for the superseded gather() function.) The three estimate
# columns are stacked into a pair of columns: estimate_type holds the original
# column name and estimate_value holds the number. As an example:
# Note: the resulting rows are ordered differently than gather() would order
# them, but the columns and values are the same.
data2 <- data %>%
  select(age, demographic, total_estimate, male_estimate, female_estimate) %>%
  pivot_longer(
    cols = c(total_estimate, male_estimate, female_estimate),
    names_to = "estimate_type",
    values_to = "estimate_value"
  )
# The tidy data frame can now be charted, for example with one bar per
# demographic whose height comes straight from estimate_value:
ggplot(data = data2, mapping = aes(x = demographic, y = estimate_value)) +
  geom_col()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment