Last active
April 18, 2024 19:01
-
-
Save expersso/bb03efcab2a6c125da5ac22e1c33d070 to your computer and use it in GitHub Desktop.
Extracting HTML tables and arbitrary HTML attributes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPECIFIC PROBLEM: Extract an html table and include all links | |
# (even when there are no links or multiple links per cell) | |
# | |
# GENERAL PROBLEM: Extract an html table and include all attributes | |
# for any given tag and attribute | |
# | |
# Example usage at bottom of the page | |
# | |
# Motivation: https://twitter.com/daattali/status/717582654476914688 | |
library(rvest) | |
library(purrr) | |
#For a given tag, attribute and table node, extracts the attribute in entry i, j | |
get_attr <- function(i, j, tbl_node, tag, attr) { | |
xpath <- sprintf("//tr[position() = %s]//*[position() = %s]//%s", i, j, tag) | |
tbl_node %>% xml_find_all(paste0(xml_path(.), xpath)) %>% html_attr(attr) | |
} | |
# Given a DF and a function, returns a function that applies the given function | |
# to every element in the DF by column-major order | |
iterating_over <- function(df, f) { | |
j <- 1:ncol(df) | |
i <- 1:nrow(df) | |
function(...) { | |
map(j, function(j) { | |
map(i, function(i) { | |
f(i, j, ...) | |
}) | |
}) | |
} | |
} | |
# Given a DF, creates vector of names of the form | |
# c("var1", "var1_suffix", "var2", "var2_suffix", ...) | |
make_names <- function(df, suffix, sep = "_") { | |
nm <- names(df) | |
nm_suffix <- paste(names(df), suffix, sep = sep) | |
list(nm, nm_suffix) %>% | |
transpose() %>% | |
flatten() %>% | |
flatten_chr() | |
} | |
# Given a DF, drops all columns where all entries are empty | |
drop_empty_cols <- function(df) { | |
no_links_idx <- map_lgl(df, ~all(map_lgl(.x, is_empty))) | |
df[, !no_links_idx] | |
} | |
# Given a list of lists, zips them together into a single data frame | |
zip_dfs <- . %>% | |
transpose() %>% | |
map(function(col) map(col, cbind)) %>% | |
map(as.data.frame, stringsAsFactors = FALSE) %>% | |
reduce(cbind) | |
# Given a table node, tag, and attribute, creates a dataframe with entries and HTML attributes | |
extract_tbl_w_attr <- function(tbl_node, tag, attr, ...) { | |
tbl <- html_table(tbl_node, ...) | |
links <- iterating_over(tbl, get_attr)(tbl_node = tbl_node, tag = tag, attr = attr) | |
list(tbl, links) %>% | |
zip_dfs() %>% | |
set_names(make_names(tbl, suffix = attr)) %>% | |
drop_empty_cols() | |
} | |
# IO section | |
df <- "https://github.com/daattali/addinslist/blob/master/README.md#list-of-addins" %>% | |
read_html() %>% | |
xml_find_one("//table") %>% | |
extract_tbl_w_attr(tag = "a", attr = "href") | |
dplyr::glimpse(df) | |
# Observations: 28 | |
# Variables: 10 | |
# $ Name (chr) "Browse RStudio addins", "Colour picker", "ggplot2 Marginal Plots", "ggplot Theme Ass... | |
# $ Description (chr) "Browse and install RStudio addins", "Lets you easily select colours", "Add marginal ... | |
# $ Package (chr) "addinslist", "shinyjs", "ggExtra", "ggThemeAssist", "taskscheduleR", "jadd", "WrapRm... | |
# $ Package_href (list) https://github.com/daattali/addinslist, https://github.com/daattali/shinyjs, https:/... | |
# $ On CRAN? (chr) "\u2705", "\u2705", "\u2705", "\u2705", "\u274c", "\u274c", "\u274c", "\u274c", "\u27... | |
# $ Author (chr) "Dean Attali", "Dean Attali", "Dean Attali", "Calli Gross", "jwijffels", "Jenny Bryan... | |
# $ Author_href (list) http://deanattali.com/, http://deanattali.com/, http://deanattali.com/, https://gith... | |
# $ More links (chr) "Screenshot", "Screenshot, Demo video", "Screenshot", "Demo GIF", "Screenshot", "Demo... | |
# $ More links_href (list) https://raw.githubusercontent.com/daattali/addinslist/master/inst/media/addin.png, h... | |
# $ Notes (chr) "", "", "", "", "", "", "", "", "", "Doesn't work on Windows", "", "", "", "", "", ""... | |
# Also works for e.g. the "alias" attribute of the "g-emoji" tags | |
emojis <- "https://github.com/daattali/addinslist/blob/master/README.md#list-of-addins" %>% | |
read_html() %>% | |
xml_find_one("//table") %>% | |
extract_tbl_w_attr("g-emoji", "alias") | |
dplyr::glimpse(emojis) | |
# Observations: 28 | |
# Variables: 8 | |
# $ Name (chr) "Browse RStudio addins", "Colour picker", "ggplot2 Marginal Plots", "ggpl... | |
# $ Description (chr) "Browse and install RStudio addins", "Lets you easily select colours", "A... | |
# $ Package (chr) "addinslist", "shinyjs", "ggExtra", "ggThemeAssist", "taskscheduleR", "ja... | |
# $ On CRAN? (chr) "\u2705", "\u2705", "\u2705", "\u2705", "\u274c", "\u274c", "\u274c", "\u... | |
# $ On CRAN?_alias (list) white_check_mark, white_check_mark, white_check_mark, white_check_mark, ... | |
# $ Author (chr) "Dean Attali", "Dean Attali", "Dean Attali", "Calli Gross", "jwijffels", ... | |
# $ More links (chr) "Screenshot", "Screenshot, Demo video", "Screenshot", "Demo GIF", "Screen... | |
# $ Notes (chr) "", "", "", "", "", "", "", "", "", "Doesn't work on Windows", "", "", ""... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment