hepplerj · March 11, 2020 19:36
diff --git a/census_cleanup.R b/census_cleanup.R
 library(tidyverse)
 library(tidycensus)

 # My recommendation is to use the tidycensus library to make getting this data
 # easier than reading in the data from the Census website. 
 #
 # Before you can begin, you'll need to get an API key from the Census Bureau.
 # You can acquire one here: https://api.census.gov/data/key_signup.html
 #
 # Once you have the API key, run the following in RStudio: 
 # usethis::edit_r_environ()
 #
 # This will open your .Renviron file. Here, you'll add the following line, replacing
 # YOUR_API_KEY with the key sent to you by the Census Bureau: 
 # CENSUS_API_KEY = YOUR_API_KEY
 #
 # Restart R for the changes to take effect.

 # Finding Census Data -----------------------------------------------------

 # First, we'll need to know the variable ID from the Census or ACS -- since there
 # are thousands of these IDs across different Census files, we'll use the
 # load_variables function to find the information. It takes two arguments: 
 # 1. the year,  and 2. the dataset.

 # Fetch the variable catalogue for the 2018 "acs5" dataset.
 variables <- load_variables(year = 2018, dataset = "acs5")

 # We'll now open up the data frame and look for the button in the upper left
 # called 'Filter.' From here, start typing "attainment" to find the set of
 # variables related to that data. Here we can see the list of IDs that are
 # associated with that dataset.
 #
 # View() opens a viewer window, so we only call it in an interactive session.
 # That way the script still runs from Rscript or a batch job.
 if (interactive()) {
   View(variables)
 }

 # We'll use that list of IDs to build a data frame.
 # We request variables B15001_001 through B15001_083. Rather than typing all
 # 83 IDs by hand, sprintf() builds them: "%03d" pads each number to three
 # digits with leading zeros, so 1 becomes "001" and 83 becomes "083".
 attainment <- get_acs(
   geography = "us",
   variables = sprintf("B15001_%03d", 1:83),
   year = 2018
 )

 # Now we can do things like chart the data. For example:
 # geom_col() draws bars whose heights come straight from the estimate column,
 # with one bar per variable ID.
 ggplot(data = attainment, mapping = aes(x = variable, y = estimate)) +
   geom_col()

 # Using your export ------------------------------------------------------

 # Just as a note: the data set you sent me would take quite a bit of work to get
 # it into a tidy data format. As a starting point, you could do something like
 # the code below (but I can't do the work for you): 

 # First, we read in our data. We use the skip argument to tell read_csv to 
 # ignore the first three rows of the spreadsheet. 
 data <- read_csv(file = "export (3).csv", skip = 3)

 # Then, we use the names() function to manually rename our columns. The 16
 # names below are applied to the columns in order, left to right.
 names(data) <- c(
   "age", "type", "demographic", "completed",
   "total_estimate", "total_moe",
   "percent_estimate", "percent_moe",
   "male_estimate", "male_estimate_moe",
   "male_percent", "male_percent_moe",
   "female_estimate", "female_estimate_moe",
   "female_percent", "female_percent_moe"
 )

 # Finally, we use pivot_longer() to get our data into a tidy format. (It is
 # the current replacement for the older gather() function.) As an example:
 # the three estimate columns are stacked into rows, with the original column
 # name stored in estimate_type and its number stored in estimate_value.
 # Note: pivot_longer() needs the three columns to share one type, and it
 # orders the output rows differently than gather() did.
 data2 <- data %>%
   select(age, demographic, total_estimate, male_estimate, female_estimate) %>%
   pivot_longer(
     cols = c(total_estimate, male_estimate, female_estimate),
     names_to = "estimate_type",
     values_to = "estimate_value"
   )

 # Now we could chart things, for example:
 # geom_col() uses the values in estimate_value as the bar heights, with one
 # bar position per demographic.
 ggplot(data = data2, mapping = aes(x = demographic, y = estimate_value)) +
   geom_col()
	library(tidyverse)
	library(tidycensus)

	# My recommendation is to use the tidycensus library to make getting this data
	# easier than reading in the data from the Census website.
	#
	# Before you can begin, you'll need to get an API key from the Census Bureau.
	# You can acquire one here: https://api.census.gov/data/key_signup.html
	#
	# Once you have the API key, run the following in RStudio:
	# usethis::edit_r_environ()
	#
	# This will open your .Renviron file. Here, you'll add the following line, replacing
	# YOUR_API_KEY with the key sent to you by the Census Bureau:
	# CENSUS_API_KEY = YOUR_API_KEY
	#
	# Restart R for the changes to take effect.

	# Finding Census Data -----------------------------------------------------

	# First, we'll need to know the variable ID from the Census or ACS -- since there
	# are thousands of these IDs across different Census files, we'll use the
	# load_variables function to find the information. It takes two arguments:
	# 1. the year, and 2. the dataset.

	# Fetch the variable catalogue for the 2018 "acs5" dataset.
	variables <- load_variables(year = 2018, dataset = "acs5")

	# We'll now open up the data frame and look for the button in the upper left
	# called 'Filter.' From here, start typing "attainment" to find the set of
	# variables related to that data. Here we can see the list of IDs that are
	# associated with that dataset.
	#
	# View() opens a viewer window, so we only call it in an interactive session.
	# That way the script still runs from Rscript or a batch job.
	if (interactive()) {
	  View(variables)
	}

	# We'll use that list of IDs to build a data frame.
	# We request variables B15001_001 through B15001_083. Rather than typing all
	# 83 IDs by hand, sprintf() builds them: "%03d" pads each number to three
	# digits with leading zeros, so 1 becomes "001" and 83 becomes "083".
	attainment <- get_acs(
	  geography = "us",
	  variables = sprintf("B15001_%03d", 1:83),
	  year = 2018
	)

	# Now we can do things like chart the data. For example:
	# geom_col() draws bars whose heights come straight from the estimate column,
	# with one bar per variable ID.
	ggplot(data = attainment, mapping = aes(x = variable, y = estimate)) +
	  geom_col()

	# Using your export ------------------------------------------------------

	# Just as a note: the data set you sent me would take quite a bit of work to get
	# it into a tidy data format. As a starting point, you could do something like
	# the code below (but I can't do the work for you):

	# First, we read in our data. We use the skip argument to tell read_csv to
	# ignore the first three rows of the spreadsheet.
	data <- read_csv(file = "export (3).csv", skip = 3)

	# Then, we use the names() function to manually rename our columns. The 16
	# names below are applied to the columns in order, left to right.
	names(data) <- c(
	  "age", "type", "demographic", "completed",
	  "total_estimate", "total_moe",
	  "percent_estimate", "percent_moe",
	  "male_estimate", "male_estimate_moe",
	  "male_percent", "male_percent_moe",
	  "female_estimate", "female_estimate_moe",
	  "female_percent", "female_percent_moe"
	)

	# Finally, we use pivot_longer() to get our data into a tidy format. (It is
	# the current replacement for the older gather() function.) As an example:
	# the three estimate columns are stacked into rows, with the original column
	# name stored in estimate_type and its number stored in estimate_value.
	# Note: pivot_longer() needs the three columns to share one type, and it
	# orders the output rows differently than gather() did.
	data2 <- data %>%
	  select(age, demographic, total_estimate, male_estimate, female_estimate) %>%
	  pivot_longer(
	    cols = c(total_estimate, male_estimate, female_estimate),
	    names_to = "estimate_type",
	    values_to = "estimate_value"
	  )

	# Now we could chart things, for example:
	# geom_col() uses the values in estimate_value as the bar heights, with one
	# bar position per demographic.
	ggplot(data = data2, mapping = aes(x = demographic, y = estimate_value)) +
	  geom_col()