@jbryer
Created February 13, 2025 12:58
Download IPEDS Data

An R script that scrapes the IPEDS Data Center file listing for each survey year (1980–2023), saves each year's listing as a table-of-contents CSV, and downloads every linked zip data file.
library(xml2)   # read_html()
library(rvest)  # html_nodes(), html_table(), html_attr()

out_dir <- '~/Downloads/IPEDS/'
years <- 2023:1980
ipeds_base <- 'https://nces.ed.gov/ipeds/datacenter/'
ipeds_url <- 'https://nces.ed.gov/ipeds/datacenter/DataFiles.aspx?year='

error_links <- c() # Save any links that could not be downloaded.

for(year in years) {
  cat(paste0('Downloading year ', year, '...\n'))
  dir.create(paste0(out_dir, year), showWarnings = FALSE, recursive = TRUE)

  page <- read_html(paste0(ipeds_url, year))
  tables <- page |> html_nodes("table") |> html_table(convert = FALSE)

  # Guessing the table with the most rows is the data file index for the year;
  # save it as a table of contents alongside the downloaded files.
  tab_index <- lapply(tables, nrow) |> unlist() |> which.max()
  write.csv(tables[[tab_index]],
            file = paste0(out_dir, year, '/_TOC_', year, '.csv'),
            row.names = FALSE)

  # Collect all links to zip files and download any not already on disk.
  links <- html_attr(html_nodes(page, "a"), "href")
  zip_files <- links[grep("\\.zip$", links)]
  for(i in zip_files) {
    dest <- paste0(out_dir, year, '/', basename(i))
    if(!file.exists(dest)) {
      cat(paste0('Downloading ', basename(i), '...\n'))
      tryCatch({
        # mode = 'wb' keeps zip files intact on Windows.
        download.file(url = paste0(ipeds_base, i), destfile = dest, mode = 'wb')
      }, error = function(e) {
        error_links <<- c(error_links, paste0(ipeds_base, i))
        print(e)
      })
    }
  }
}

error_links # Any URLs that failed to download.
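If any URLs end up in error_links, they can be retried after the main loop completes. The sketch below is an assumption, not part of the original gist: it saves any recovered files into a single _retry/ folder under out_dir, since the survey year is not reliably recoverable from the URL alone.

# Hypothetical retry of failed downloads; saves into a single _retry/ folder.
if(length(error_links) > 0) {
  retry_dir <- paste0(out_dir, '_retry/')
  dir.create(retry_dir, showWarnings = FALSE, recursive = TRUE)
  for(link in error_links) {
    dest <- paste0(retry_dir, basename(link))
    tryCatch({
      download.file(url = link, destfile = dest, mode = 'wb')
    }, error = function(e) {
      message('Still unable to download: ', link)
    })
  }
}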