@jbryer
Created February 13, 2025 12:58
Download IPEDS Data

An R script that scrapes the IPEDS Data Center file listing for each survey year (1980–2023), saves each year's listing as a table-of-contents CSV, and downloads every linked zip data file.
library(xml2)   # read_html()
library(rvest)  # html_nodes(), html_table(), html_attr()

out_dir <- '~/Downloads/IPEDS/'
years <- 2023:1980
ipeds_base <- 'https://nces.ed.gov/ipeds/datacenter/'
ipeds_url <- 'https://nces.ed.gov/ipeds/datacenter/DataFiles.aspx?year='

error_links <- c() # Save any links that could not be downloaded.

for(year in years) {
  cat(paste0('Downloading year ', year, '...\n'))
  dir.create(paste0(out_dir, year), showWarnings = FALSE, recursive = TRUE)

  page <- read_html(paste0(ipeds_url, year))
  tables <- page |> html_nodes("table") |> html_table(convert = FALSE)

  # Guessing the table with the most rows is the data file index for the year;
  # save it as a table of contents alongside the downloaded files.
  tab_index <- lapply(tables, nrow) |> unlist() |> which.max()
  write.csv(tables[[tab_index]],
            file = paste0(out_dir, year, '/_TOC_', year, '.csv'),
            row.names = FALSE)

  # Collect all links to zip files and download any not already on disk.
  links <- html_attr(html_nodes(page, "a"), "href")
  zip_files <- links[grep("\\.zip$", links)]
  for(i in zip_files) {
    dest <- paste0(out_dir, year, '/', basename(i))
    if(!file.exists(dest)) {
      cat(paste0('Downloading ', basename(i), '...\n'))
      tryCatch({
        # mode = 'wb' keeps zip files intact on Windows.
        download.file(url = paste0(ipeds_base, i), destfile = dest, mode = 'wb')
      }, error = function(e) {
        error_links <<- c(error_links, paste0(ipeds_base, i))
        print(e)
      })
    }
  }
}

error_links # Any URLs that failed to download.
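If any URLs end up in error_links, they can be retried after the main loop completes. The sketch below is an assumption, not part of the original gist: it saves any recovered files into a single _retry/ folder under out_dir, since the survey year is not reliably recoverable from the URL alone.

# Hypothetical retry of failed downloads; saves into a single _retry/ folder.
if(length(error_links) > 0) {
  retry_dir <- paste0(out_dir, '_retry/')
  dir.create(retry_dir, showWarnings = FALSE, recursive = TRUE)
  for(link in error_links) {
    dest <- paste0(retry_dir, basename(link))
    tryCatch({
      download.file(url = link, destfile = dest, mode = 'wb')
    }, error = function(e) {
      message('Still unable to download: ', link)
    })
  }
}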