Last active
May 2, 2023 06:42
-
-
Save markdanese/112c3ccb0f98bd640d24 to your computer and use it in GitHub Desktop.
Scrape NHANES website and generate listing of all data (.xpt) and documentation (.htm) files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(magrittr) | |
library(rvest) | |
library(xml2) | |
#' Build a listing of NHANES data and documentation files
#'
#' Downloads the NHANES data page, reads the table text from it, and attaches
#' the hyperlinks to the data (.xpt) and documentation (.htm) files found
#' inside the `#PageContents_GridView1` element.
#'
#' @return A data.frame: the rows of the scraped table (excluding rows whose
#'   `data_file` is "RDC Only") merged on a lower-case `key` column with the
#'   matching `data_link` and, where one exists, `doc_link` (otherwise `NA`).
get_nhanes_listing <- function() {
  nhanes_url <- "http://wwwn.cdc.gov/Nchs/Nhanes/Search/DataPage.aspx"
  page <- xml2::read_html(nhanes_url)

  # Lower-cased file name without its extension; used as the join key.
  # The pattern is anchored and the dot escaped so that only a real trailing
  # extension is removed (an unescaped "." would match any character, anywhere).
  link_key <- function(urls, ext_pattern) {
    tolower(sub(ext_pattern, "", basename(urls), ignore.case = TRUE))
  }

  # html_table() yields only the cell text, not the hyperlinks in the cells;
  # those are collected separately below.
  table_text <- data.frame(rvest::html_table(page), stringsAsFactors = FALSE)
  names(table_text) <- tolower(gsub("\\.", "_", names(table_text)))

  # Fail loudly if the page layout no longer provides the columns used below,
  # instead of silently returning an empty listing.
  missing_cols <- setdiff(c("data_file", "doc_file"), names(table_text))
  if (length(missing_cols) > 0) {
    stop(
      "NHANES table is missing expected column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # %in% never returns NA, so rows with a missing data_file are kept intact
  # rather than being turned into all-NA rows by logical NA indexing.
  keep <- !(table_text$data_file %in% "RDC Only")
  table_text <- table_text[keep, , drop = FALSE]
  table_text$key <- tolower(gsub(" Doc", "", table_text$doc_file))

  cell_urls <- rvest::html_attr(
    rvest::html_nodes(page, "#PageContents_GridView1 a"),
    "href"
  )

  doc_links <- cell_urls[grepl("\\.htm$", cell_urls, ignore.case = TRUE)]
  documentation <- data.frame(
    doc_link = doc_links,
    key = link_key(doc_links, "\\.htm$"),
    stringsAsFactors = FALSE
  )

  data_links <- cell_urls[grepl("\\.xpt$", cell_urls, ignore.case = TRUE)]
  download_url <- data.frame(
    data_link = data_links,
    key = link_key(data_links, "\\.xpt$"),
    stringsAsFactors = FALSE
  )

  # Keep every data file even when it has no documentation link (all.x), then
  # keep only table rows that have a matching data file (inner join).
  url_list <- merge(download_url, documentation, by = "key", all.x = TRUE)
  merge(table_text, url_list, by = "key")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated 4 October 2015 to make sure it works with the updated rvest and xml2 packages, and to fix bugs in the original.