Created
April 29, 2015 09:51
-
-
Save jlehtoma/2b5d92af69e9148a863c to your computer and use it in GitHub Desktop.
Scrape bird transect line URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages used below: dplyr for building/binding data frames, magrittr for
# the %>% pipe, rvest for downloading and querying the HTML page.
library("dplyr")
library("magrittr")
library("rvest")

# Page to scrape (per the script's description: the bird transect line list).
urls <- c("http://koivu.luomus.fi/seurannat/linjalaskenta/vakiolinjat.php")

# Download and parse the page. `read_html()` is the replacement for `html()`,
# which was deprecated in rvest 0.3.0 and is no longer available in current
# rvest releases.
site <- read_html(urls)

# Take the first <table> on the page and collect all of its rows (<tr>).
# The first row is the header; it is skipped later when rows are processed.
linjat_table <- site %>%
  html_node("table") %>%
  html_nodes("tr")
#' Extract the transect ID and link URLs from one table row
#'
#' Reads the `<td>` cells of a single row and pulls out the text of cell 1
#' plus the `href` of the first `<a>` found in cells 13, 15 and 16.
#'
#' @param x A single table row node (`<tr>`), e.g. one element of the node
#'   set returned by `html_nodes(table, "tr")`.
#' @return A one-row tibble with columns `id`, `map_url`, `map_pdf_url` and
#'   `form_pdf_url`. NOTE(review): a cell without a link presumably yields
#'   `NA` for its URL column (rvest's behaviour for missing nodes) -- confirm
#'   against the installed rvest version.
extract_urls <- function(x) {
  # Get all table data nodes from a single row
  td_nodes <- html_nodes(x, "td")

  # Cells up to index 16 are read below; fail with a clear message instead of
  # an opaque "subscript out of bounds" when the row is shorter than that.
  if (length(td_nodes) < 16) {
    stop("Expected at least 16 <td> cells in a transect row, found ",
         length(td_nodes), call. = FALSE)
  }

  # href of the first <a> element inside a cell
  link_href <- function(cell) {
    html_attr(html_node(cell, "a"), "href")
  }

  transect_data <- list(
    # Transect ID
    id = html_text(td_nodes[[1]]),
    # "Kansalaisen karttapaikka" (map service) link
    map_url = link_href(td_nodes[[13]]),
    # Map PDF link
    map_pdf_url = link_href(td_nodes[[15]]),
    # Form PDF link
    form_pdf_url = link_href(td_nodes[[16]])
  )

  # Coerce the list to a tibble. `as_tibble()` replaces the deprecated
  # `as_data_frame()`.
  dplyr::as_tibble(transect_data)
}
# Skip the first row (it's the header). A negative index is used instead of
# `2:length(linjat_table)`, because `2:1` counts downwards and would select
# rows 2 and 1 when the table contains only the header row.
transect_data <- lapply(linjat_table[-1], extract_urls)

# Stack the per-row results into a single data frame
transect_data <- dplyr::bind_rows(transect_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment