Created
March 8, 2024 14:36
-
-
Save PietrH/b737b94812a75564af0d67684c01f7b8 to your computer and use it in GitHub Desktop.
Check and mine the links in a just the docs documentation website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Test all links on a webpage | |
## GOAL 1 : list all links | |
## GOAL 2: test all links | |
# for the anatomy of a url, I referred to | |
# https://www.netmeister.org/blog/urls.html | |
# load libraries ---------------------------------------------------------- | |
library(rvest) | |
library(httr2) | |
library(purrr) | |
library(dplyr) | |
library(cli) | |
# set domain to test ------------------------------------------------------ | |
home_url <- "https://docs.b-cubed.eu/dev-guide/" | |
hostname <- "https://docs.b-cubed.eu" | |
get_links <- function(url){ | |
rvest::read_html(url) %>% | |
rvest::html_nodes("a") %>% | |
rvest::html_attr("href") | |
} | |
home_links <- get_links(home_url) | |
# internal links ---------------------------------------------------------- | |
# should be prefixed with the hostname | |
prefix_with_hostname <- | |
function(pathname, hostname = "https://docs.b-cubed.eu") { | |
paste0(hostname, pathname) | |
} | |
## all pages of the dev guide --------------------------------------------- | |
pages_to_test <- | |
home_links[stringr::str_starts(home_links, stringr::fixed("/dev-guide/"))] %>% | |
prefix_with_hostname() | |
# check if url resolves --------------------------------------------------- | |
# Return error when url does not resolve | |
check_url <- function(url) { | |
httr2::request(url) %>% | |
httr2::req_retry(max_tries = 3, max_seconds = 2) %>% | |
httr2::req_user_agent(string = "r_link_checker") %>% | |
httr2::req_perform() %>% | |
httr2::resp_check_status() | |
} | |
# get links of the dev guide ---------------------------------------------- | |
all_links <- | |
purrr::map(pages_to_test, get_links) %>% | |
purrr::set_names(pages_to_test) | |
# if it starts with # it's a link within the same path, if it starts with / it's | |
# a link starting from the hostname | |
prefix_with_page_url <- function(pathname, page_url){ | |
paste0(page_url,pathname) | |
} | |
# convert to a tibble with a column for url, is_heading for #, is_internal_link | |
# for internal links, and is_external_link for external links | |
identify_links <- function(links_vector) { | |
dplyr::tibble( | |
url = links_vector, | |
is_heading = stringr::str_starts(url, stringr::fixed("#")), | |
is_interal_link = stringr::str_starts(url, stringr::fixed("/")), | |
is_external_link = !(is_heading | is_interal_link) | |
) | |
} | |
# extract the external links only | |
external_links <- | |
purrr::map(all_links, identify_links) %>% | |
purrr::map_dfr(~dplyr::filter(.x, is_external_link)) %>% | |
dplyr::distinct(url) %>% | |
dplyr::pull(url) | |
# test the external links ------------------------------------------------- | |
external_links_test_result <- | |
purrr::map(external_links, purrr::safely(check_url), | |
.progress = TRUE) %>% | |
purrr::set_names(external_links) | |
external_links_broken <- | |
purrr::keep(external_links_test_result, | |
function(x) { | |
!is.null(purrr::chuck(x, "error")) | |
}) | |
# print a little report --------------------------------------------------- | |
cli::cli_h2("The following external links are broken:") | |
cli::cli_li(items = names(external_links_broken)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment