Skip to content

Instantly share code, notes, and snippets.

@daranzolin
Created December 29, 2018 22:40
Show Gist options
  • Select an option

  • Save daranzolin/fe606a2584216e1fe314b4169abada93 to your computer and use it in GitHub Desktop.

Select an option

Save daranzolin/fe606a2584216e1fe314b4169abada93 to your computer and use it in GitHub Desktop.
library(quantarcticR)
library(rvest)
library(tidyverse)
library(stringr)
## Get paths from quantarcticR datasets
quantarcticR_datasets <- qa_datasets()

# Reduce every data source path to components 2 and 3 of the part that follows
# the "quantarcticR-cache/" marker, keeping each pair once, in order of first
# appearance.
unique_file_paths <- quantarcticR_datasets$datasource %>%
  strsplit("quantarcticR-cache/", fixed = TRUE) %>%
  # Text after the cache marker. map_chr() (instead of sapply()) always yields
  # a character vector and fails loudly when a path lacks the marker.
  map_chr(2) %>%
  strsplit("/", fixed = TRUE) %>%
  map(function(parts) parts[2:3]) %>%
  unique() %>%
  # Drop pairs in which a component contains a dot -- presumably a file name
  # rather than a directory (TODO confirm against the cache layout).
  discard(function(parts) any(grepl(".", parts, fixed = TRUE))) %>%
  map_chr(paste, collapse = "/")

## Build pages
# One URL per unique relative path, appended to the base URL below.
BASE_URL <- "http://data.pgc.umn.edu/gis/packages/quantarctica/Quantarctica3/"
paths <- paste0(BASE_URL, unique_file_paths)
# Read the HTML page at `path` and return a tibble with one row per listed
# entry: `name` holds the text of the ".indexcolname" nodes and `size` the text
# of the ".indexcolsize" nodes. The first two matches of each selector are
# dropped (presumably the header row and the parent-directory link -- TODO
# confirm against a live page).
scrape_page <- function(path) {
  page <- read_html(path)

  # Text of every node matching `css`, minus the first two matches.
  column_text <- function(css) {
    cells <- html_text(html_nodes(page, css))
    cells[-c(1:2)]
  }

  tibble(
    name = column_text(".indexcolname"),
    size = column_text(".indexcolsize")
  )
}
## Scrape every page
# safely() turns a scraping error into a list(result = NULL, error = <cond>)
# entry instead of aborting the whole run.
safe_scrape <- safely(scrape_page)
qdata <- map(paths, safe_scrape)

# Report pages that could not be scraped rather than dropping them silently.
failed <- map_lgl(qdata, function(res) !is.null(res$error))
if (any(failed)) {
  warning(
    sum(failed), " of ", length(paths), " pages could not be scraped: ",
    paste(paths[failed], collapse = ", "),
    call. = FALSE
  )
}

# Failed pages contribute a NULL result, which bind_rows() skips.
results <- map(qdata, "result")
results_df <- bind_rows(results)

# Number of scraped files whose name matches the file name of a data source.
# The original referenced `ds`, which is never defined in this script; the
# datasets object is `quantarcticR_datasets`.
filter(results_df, name %in% basename(quantarcticR_datasets$datasource)) %>%
  nrow()
#filter(results_df, name %in% basename(quantarcticR_datasets$datasource)) %>% View()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment