Skip to content

Instantly share code, notes, and snippets.

@thoughtfulbloke
Created April 11, 2021 04:42
Show Gist options
  • Save thoughtfulbloke/f1b275e588e1ac7f1a2bfdd20a5c8a7e to your computer and use it in GitHub Desktop.
Save thoughtfulbloke/f1b275e588e1ac7f1a2bfdd20a5c8a7e to your computer and use it in GitHub Desktop.
# works through
# https://www.health.govt.nz/news-media/media-releases?page=1 and subsequent pages
library(rvest)
library(dplyr)
library(stringr)
table_fetch <- function(x){
url_loc <- paste0("https://www.health.govt.nz",x)
page_obj <- url_loc %>% read_html()
table_objs <- page_obj %>% html_nodes("table")
update_date <- page_obj %>%
html_nodes("span.date-display-single") %>%
html_attr("content")
# recent tableformat reports have the cases table and a footer table
# so assumng 2 tables with cases being table 1
if(length(table_objs) < 2) {
return(data.frame(page_url = x,
update_datetime = update_date,
stringsAsFactors = FALSE))
}
tbl <- (table_objs %>% html_table(fill = TRUE))[[1]]
tbl$page_url <- x
tbl$update_datetime = update_date
return(tbl)
}
getlinks <- function(x){
webpage <- paste0("https://www.health.govt.nz/news-media/media-releases?page=",x)
links <- unique(webpage %>% read_html() %>% html_nodes("a") %>% html_attr("href"))
links <- links[str_starts(links, fixed("/news-media/media-releases/")) & !is.na(links)]
return(links)
}
pages_back_to_go <- 15
all_links <- lapply(1:pages_back_to_go, getlinks) %>% unlist() %>% unique()
all_table_1s <- lapply(all_links, table_fetch)
aggregate_tables <- bind_rows(all_table_1s)
write.csv(aggregate_tables, file="MoHcheck_by_hand.csv", row.names=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment