Last active
June 7, 2023 19:04
-
-
Save knbknb/f453a083cc2f2f55cd3b85b498f77ec3 to your computer and use it in GitHub Desktop.
R: scrape something from Wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(lubridate)
library(rvest)

# Scrape a table from a Wikipedia article.
# The page is fetched through an Internet Archive snapshot rather than the
# live site, because live web pages change over time and a dated snapshot
# keeps the result reproducible.
url <- "https://web.archive.org/web/20220908211042/https://en.wikipedia.org/wiki/..."

# Download and parse the snapshot; printing the document gives a quick
# sanity check when the script is run interactively.
wiki_raw <- read_html(url)
wiki_raw

# Select the table nodes, convert each one to a data frame, and stack the
# resulting list into a single data frame.
# NOTE: the XPath "//table[2]" matches every <table> that is the second
# <table> child of its parent, so more than one node may be returned;
# bind_rows() combines whatever html_table() produces.
wiki_extracted <- bind_rows(
  html_table(
    html_nodes(wiki_raw, xpath = "//table[2]")
  )
)
wiki_extracted
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read every URL in `url_list` without aborting on failures and return either
# the successfully read contents or the captured errors.
#
# Arguments:
#   url_list  URLs (or paths) handed one by one to read_lines().
#   type      Which half of the outcome to return: "result" (the default)
#             or "error". Validated with match.arg().
#
# Returns the `type` element of the transposed outcomes: one entry per URL,
# named by the URL.
url_tester <- function(url_list, type = c("result", "error")) {
  type <- match.arg(type)

  # safely() makes read_lines() return a list(result, error) pair instead of
  # raising, so a single bad URL cannot stop the whole run.
  read_lines_safely <- safely(read_lines)
  outcomes <- map(url_list, read_lines_safely)
  outcomes <- set_names(outcomes, url_list)

  # Flip "one (result, error) pair per URL" into "$result and $error, each
  # holding one entry per URL", then keep only the requested half.
  by_kind <- transpose(outcomes)
  pluck(by_kind, type)
}
# Try this function on a vector of URLs and keep only the bad ones:
# requesting type = "error" yields NULL for every URL that was read
# successfully, and compact() drops those NULL entries.
# `urls` is expected to be defined by the caller before this line runs.
url_tester(urls, type = "error") %>% compact()
# Variant: return only the status code of each URL that could be requested.
#
# Every URL is requested with httr::GET() wrapped in possibly(), so a request
# that raises an R error yields NULL instead of stopping the run. Those NULL
# entries are dropped before the status codes are extracted.
#
# Arguments:
#   url_list  URLs to request, one GET per element.
#
# Returns a list named by URL holding the "status_code" element of each
# response; URLs whose request errored are absent from the result.
url_tester <- function(url_list) {
  # httr is not attached by the library() calls in this file, so GET is
  # referenced through its namespace; a bare `GET` would not be found.
  safe_get <- possibly(httr::GET, otherwise = NULL)

  url_list %>%
    # One request per URL; failures become NULL
    map(safe_get) %>%
    # Name the responses by URL before dropping failures so that the names
    # stay aligned with the responses they belong to
    set_names(url_list) %>%
    # Remove the NULLs left by failed requests
    compact() %>%
    # Extract the "status_code" element of every remaining response
    map("status_code")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment