Skip to content

Instantly share code, notes, and snippets.

@expersso
Last active April 18, 2024 19:01
Show Gist options
  • Save expersso/bb03efcab2a6c125da5ac22e1c33d070 to your computer and use it in GitHub Desktop.
Save expersso/bb03efcab2a6c125da5ac22e1c33d070 to your computer and use it in GitHub Desktop.
Extracting HTML tables and arbitrary HTML attributes
# SPECIFIC PROBLEM: Extract an html table and include all links
# (even when there are no links or multiple links per cell)
#
# GENERAL PROBLEM: Extract an html table and include all attributes
# for any given tag and attribute
#
# Example usage at bottom of the page
#
# Motivation: https://twitter.com/daattali/status/717582654476914688
library(rvest)
library(purrr)
#For a given tag, attribute and table node, extracts the attribute in entry i, j
get_attr <- function(i, j, tbl_node, tag, attr) {
xpath <- sprintf("//tr[position() = %s]//*[position() = %s]//%s", i, j, tag)
tbl_node %>% xml_find_all(paste0(xml_path(.), xpath)) %>% html_attr(attr)
}
# Given a DF and a function, returns a function that applies the given function
# to every element in the DF by column-major order
iterating_over <- function(df, f) {
j <- 1:ncol(df)
i <- 1:nrow(df)
function(...) {
map(j, function(j) {
map(i, function(i) {
f(i, j, ...)
})
})
}
}
# Given a DF, creates vector of names of the form
# c("var1", "var1_suffix", "var2", "var2_suffix", ...)
make_names <- function(df, suffix, sep = "_") {
nm <- names(df)
nm_suffix <- paste(names(df), suffix, sep = sep)
list(nm, nm_suffix) %>%
transpose() %>%
flatten() %>%
flatten_chr()
}
# Given a DF, drops all columns where all entries are empty
drop_empty_cols <- function(df) {
no_links_idx <- map_lgl(df, ~all(map_lgl(.x, is_empty)))
df[, !no_links_idx]
}
# Given a list of lists, zips them together into a single data frame
zip_dfs <- . %>%
transpose() %>%
map(function(col) map(col, cbind)) %>%
map(as.data.frame, stringsAsFactors = FALSE) %>%
reduce(cbind)
# Given a table node, tag, and attribute, creates a dataframe with entries and HTML attributes
extract_tbl_w_attr <- function(tbl_node, tag, attr, ...) {
tbl <- html_table(tbl_node, ...)
links <- iterating_over(tbl, get_attr)(tbl_node = tbl_node, tag = tag, attr = attr)
list(tbl, links) %>%
zip_dfs() %>%
set_names(make_names(tbl, suffix = attr)) %>%
drop_empty_cols()
}
# IO section
df <- "https://github.com/daattali/addinslist/blob/master/README.md#list-of-addins" %>%
read_html() %>%
xml_find_one("//table") %>%
extract_tbl_w_attr(tag = "a", attr = "href")
dplyr::glimpse(df)
# Observations: 28
# Variables: 10
# $ Name (chr) "Browse RStudio addins", "Colour picker", "ggplot2 Marginal Plots", "ggplot Theme Ass...
# $ Description (chr) "Browse and install RStudio addins", "Lets you easily select colours", "Add marginal ...
# $ Package (chr) "addinslist", "shinyjs", "ggExtra", "ggThemeAssist", "taskscheduleR", "jadd", "WrapRm...
# $ Package_href (list) https://github.com/daattali/addinslist, https://github.com/daattali/shinyjs, https:/...
# $ On CRAN? (chr) "\u2705", "\u2705", "\u2705", "\u2705", "\u274c", "\u274c", "\u274c", "\u274c", "\u27...
# $ Author (chr) "Dean Attali", "Dean Attali", "Dean Attali", "Calli Gross", "jwijffels", "Jenny Bryan...
# $ Author_href (list) http://deanattali.com/, http://deanattali.com/, http://deanattali.com/, https://gith...
# $ More links (chr) "Screenshot", "Screenshot, Demo video", "Screenshot", "Demo GIF", "Screenshot", "Demo...
# $ More links_href (list) https://raw.githubusercontent.com/daattali/addinslist/master/inst/media/addin.png, h...
# $ Notes (chr) "", "", "", "", "", "", "", "", "", "Doesn't work on Windows", "", "", "", "", "", ""...
# Also works for e.g. the "alias" attribute of the "g-emoji" tags
emojis <- "https://github.com/daattali/addinslist/blob/master/README.md#list-of-addins" %>%
read_html() %>%
xml_find_one("//table") %>%
extract_tbl_w_attr("g-emoji", "alias")
dplyr::glimpse(emojis)
# Observations: 28
# Variables: 8
# $ Name (chr) "Browse RStudio addins", "Colour picker", "ggplot2 Marginal Plots", "ggpl...
# $ Description (chr) "Browse and install RStudio addins", "Lets you easily select colours", "A...
# $ Package (chr) "addinslist", "shinyjs", "ggExtra", "ggThemeAssist", "taskscheduleR", "ja...
# $ On CRAN? (chr) "\u2705", "\u2705", "\u2705", "\u2705", "\u274c", "\u274c", "\u274c", "\u...
# $ On CRAN?_alias (list) white_check_mark, white_check_mark, white_check_mark, white_check_mark, ...
# $ Author (chr) "Dean Attali", "Dean Attali", "Dean Attali", "Calli Gross", "jwijffels", ...
# $ More links (chr) "Screenshot", "Screenshot, Demo video", "Screenshot", "Demo GIF", "Screen...
# $ Notes (chr) "", "", "", "", "", "", "", "", "", "Doesn't work on Windows", "", "", ""...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment