Last active
August 20, 2021 12:10
-
-
Save hyginn/c15e697223c45b5e16b3e90300807e45 to your computer and use it in GitHub Desktop.
R script to check an HTML document for broken links. Save, source and try the example at the bottom.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# checkLinks.R | |
# Author: Boris Steipe (ORCID: 0000-0002-1134-6758) | |
# License: (c) Author (2018) + MIT | |
# Date: 2018-12-29 | |
# | |
# This R-script installs packages (if needed) and defines a function to access | |
# all http:// and https:// links referenced in an HTML document and check their | |
# status code. | |
# | |
# Issues: | |
# - relative URLs are not expanded with an inferred base-URL but discarded | |
# - calls to URLs that don't resolve throw an error. Might use tryCatch() | |
# on httr::GET() | |
# | |
# Make sure every package used below is available; install any that is missing.
for (pkg in c("xml2", "rvest", "httr")) {
  if (! requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
checkLinks <- function(URL) {
  # Purpose: fetch the HTML document at URL, request every absolute
  #          http:// or https:// link found in its <a href="..."> attributes,
  #          and report the status of each request.
  # Parameters:
  #     URL  chr  location of the HTML document (passed to xml2::read_html())
  # Value:
  #     A three-column character matrix. Row 1 is the header
  #     c("category", "reason", "URL"); the following rows hold one entry per
  #     unique link, ordered by category. Links whose request could not be
  #     completed (e.g. the host does not resolve) are reported with category
  #     "Request failed" and the error message as reason, instead of
  #     aborting the whole check.

  # download page and access URLs
  myPage <- xml2::read_html(URL)
  URLs <- rvest::html_attr(rvest::html_nodes(myPage, "a"), "href")

  # keep only URLs that begin with http:// or https://; relative links and
  # <a> elements without an href (NA) are discarded
  URLs <- URLs[grepl("^https?://", URLs)]

  # remove anchor parts (from "#" to end), since absence of internal
  # anchors does not affect the status code
  URLs <- gsub("#[^#]*$", "", URLs)

  # remove duplicates
  URLs <- unique(URLs)

  # preallocate one row per link rather than growing the matrix in the loop
  stati <- matrix("", nrow = length(URLs), ncol = 3)
  for (i in seq_along(URLs)) {
    stati[i, ] <- tryCatch({
      status <- httr::http_status(httr::GET(URLs[i]))
      c(status$category, status$reason, URLs[i])
    }, error = function(e) {
      # a failed request becomes a row in the report; the remaining links
      # are still checked
      c("Request failed", conditionMessage(e), URLs[i])
    })
  }

  # drop = FALSE keeps the matrix shape when only a single link was found
  myOrd <- order(stati[ , 1])
  stati <- rbind(c("category", "reason", "URL"),
                 stati[myOrd, , drop = FALSE])
  return(stati)
}
# Example: the if (FALSE) guard keeps this from running when the file is
# sourced. Execute the lines inside the braces by hand to try the function.
if (FALSE) {
  # URL of a test page:
  URL <- "http://steipe.biochemistry.utoronto.ca/abc/assets/testCheckLinks.html"
  checkLinks(URL)
}
# [END] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @hyginn thank you so much for creating this code!
Do you know what I can do if I get this type of error when running your function?
Error in curl::curl_fetch_memory(url, handle = handle) : Could not resolve host: revistacardus.com