Skip to content

Instantly share code, notes, and snippets.

@noamross
Last active March 26, 2025 00:39
Show Gist options
  • Save noamross/8071cc27c76f66683be287c8827f0cf8 to your computer and use it in GitHub Desktop.
Save noamross/8071cc27c76f66683be287c8827f0cf8 to your computer and use it in GitHub Desktop.
Get results of a TAGGS saved search with browser automation
#' Download the latest CSV export of a saved TAGGS search with a headless browser
#'
#' Opens `taggs_search_url` in a browser session driven by selenider/chromote,
#' clicks the page's "export to CSV" button, waits for a `.csv` file to appear
#' in a temporary download directory, and moves it to `output_filename`.
#'
#' The system should have Chrome installed.
#'
#' @param taggs_search_url URL of the saved TAGGS search page.
#' @param output_filename Path the downloaded CSV is moved to.
#' @param timeout Maximum number of seconds to wait for the download before
#'   giving up with an error. Use `Inf` to wait indefinitely.
#' @return `output_filename`, once the downloaded CSV has been moved there.
fetch_taggs_search <- function(taggs_search_url, output_filename,
                               timeout = 300) {
  stopifnot(is.numeric(timeout), length(timeout) == 1L, !is.na(timeout))

  if (!requireNamespace("chromote", quietly = TRUE)) {
    install.packages("chromote")
  }

  # Download into a scratch directory so the only CSV in it is the one we
  # asked for; remove the directory again however the function exits.
  tmp_download_dir <- fs::file_temp("dir")
  fs::dir_create(tmp_download_dir)
  on.exit(unlink(tmp_download_dir, recursive = TRUE), add = TRUE)

  session <- selenider::selenider_session()
  chromote_session <- session$driver
  chromote_session$Browser$setDownloadBehavior(
    behavior = "allow",
    downloadPath = tmp_download_dir
  )

  selenider::open_url(taggs_search_url)
  csv_button <- selenider::s("div#btnExportToCSVSearchAdvExport_AdvSearchFilter")
  selenider::elem_click(csv_button)

  # Poll for the finished download, but never wait longer than `timeout`.
  deadline <- Sys.time() + timeout
  downloaded <- fs::dir_ls(tmp_download_dir, regexp = "\\.csv$")
  while (length(downloaded) == 0L) {
    if (Sys.time() > deadline) {
      selenider::close_session(session)
      stop(
        "Timed out after ", timeout,
        " seconds waiting for the TAGGS CSV download",
        call. = FALSE
      )
    }
    Sys.sleep(1)
    downloaded <- fs::dir_ls(tmp_download_dir, regexp = "\\.csv$")
  }
  selenider::close_session(session)

  # The downloaded filename is always different, so move whatever CSV just
  # arrived (the first one, should there ever be more than one).
  fs::file_move(downloaded[1], output_filename)
  output_filename
}
#' Parse a TAGGS CSV where you include the abstract in the search results
#'
#' Reads the export (skipping its first line and dropping its last two rows),
#' cleans the column names, and folds abstract rows into the row above them.
#' A row whose `uei` is missing is treated as an abstract row: the value in its
#' `issue_date_fiscal_year` column is copied into the `abstract` column of the
#' preceding row, and the abstract row itself is then dropped.
#'
#' @param taggs_search_csv Path to the CSV downloaded from TAGGS. The file is
#'   read assuming Windows-1252 encoding.
#' @return A data frame with one row per record that has a `uei`, an added
#'   `abstract` column, `sum_of_actions` parsed to a number and
#'   `action_issue_date` parsed as a month-day-year date.
process_taggs_search <- function(taggs_search_csv) {
  # Read in the file assuming Windows encoding
  taggs_search_raw <- readr::read_csv(
    taggs_search_csv,
    skip = 1,
    locale = readr::locale(encoding = "windows-1252")
  ) |>
    head(-2) |>
    janitor::clean_names()

  # Rows without a UEI carry the abstract for the row directly above them.
  # An abstract row in position 1 has no row above it, so it is skipped.
  abstract_rows <- which(is.na(taggs_search_raw$uei))
  abstract_rows <- abstract_rows[abstract_rows > 1L]
  taggs_search_raw$abstract <- NA_character_
  taggs_search_raw$abstract[abstract_rows - 1L] <-
    as.character(taggs_search_raw$issue_date_fiscal_year[abstract_rows])

  taggs_search_raw |>
    dplyr::filter(!is.na(uei)) |>
    dplyr::mutate(
      sum_of_actions = readr::parse_number(
        stringi::stri_replace_first_fixed(sum_of_actions, "$", "")
      ),
      action_issue_date = lubridate::mdy(action_issue_date)
    )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment