Last active
March 26, 2025 00:39
-
-
Save noamross/8071cc27c76f66683be287c8827f0cf8 to your computer and use it in GitHub Desktop.
Get results of a TAGGS saved search with browser automation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Download the latest CSV export of a saved TAGGS search
#'
#' Uses web scraping with a headless browser: opens the saved search, clicks
#' the "export to CSV" button and moves the downloaded file to
#' `output_filename`. The system should have Chrome installed.
#'
#' @param taggs_search_url URL of the saved TAGGS search to open.
#' @param output_filename Path the downloaded CSV is moved to.
#' @param timeout Maximum number of seconds to wait for the CSV to appear in
#'   the download directory before giving up with an error.
#' @return `output_filename`, once the downloaded CSV has been moved there.
fetch_taggs_search <- function(taggs_search_url, output_filename,
                               timeout = 300) {
  stopifnot(is.numeric(timeout), length(timeout) == 1L, !is.na(timeout))

  if (!requireNamespace("chromote", quietly = TRUE)) {
    install.packages("chromote")
  }

  # Download into a private temp directory so that the only CSV found there is
  # the one produced by this call; remove the directory when we are done.
  tmp_download_dir <- fs::file_temp("dir")
  fs::dir_create(tmp_download_dir)
  on.exit(unlink(tmp_download_dir, recursive = TRUE), add = TRUE)

  session <- selenider::selenider_session()
  chromote_session <- session$driver
  chromote_session$Browser$setDownloadBehavior(
    behavior = "allow",
    downloadPath = tmp_download_dir
  )

  selenider::open_url(taggs_search_url)
  csv_button <- selenider::s(
    "div#btnExportToCSVSearchAdvExport_AdvSearchFilter"
  )
  selenider::elem_click(csv_button)

  # Wait for the download to complete, but never longer than `timeout` seconds
  # so a failed export cannot hang the caller forever.
  deadline <- Sys.time() + timeout
  downloaded <- fs::dir_ls(tmp_download_dir, regexp = "\\.csv$")
  while (length(downloaded) == 0L) {
    if (Sys.time() > deadline) {
      stop(
        "Timed out after ", timeout,
        " seconds waiting for the TAGGS CSV download",
        call. = FALSE
      )
    }
    Sys.sleep(1)
    downloaded <- fs::dir_ls(tmp_download_dir, regexp = "\\.csv$")
  }
  selenider::close_session(session)

  # The downloaded filename is always different, so move whatever CSV just
  # arrived; take the first match so `file_move()` gets exactly one path.
  fs::file_move(downloaded[[1L]], output_filename)
  output_filename
}
#' Parse a TAGGS CSV where you include the abstract in the search results
#'
#' Reads the export, skipping its first line and dropping its last two rows,
#' and cleans the column names. Rows with a missing `uei` are treated as
#' abstract rows: the value in their `issue_date_fiscal_year` column is copied
#' into a new `abstract` column on the row directly above, and the abstract
#' rows themselves are then removed.
#'
#' @param taggs_search_csv Path to the TAGGS CSV export.
#' @return A data frame with one row per record that has a `uei`, with an
#'   added `abstract` column, `sum_of_actions` parsed to a number and
#'   `action_issue_date` parsed as a month/day/year date.
process_taggs_search <- function(taggs_search_csv) {
  # Read in the file assuming Windows encoding
  taggs_search_raw <- readr::read_csv(
    taggs_search_csv,
    skip = 1,
    locale = readr::locale(encoding = "windows-1252")
  ) |>
    head(-2) |>
    janitor::clean_names()

  # Rows without a UEI carry the abstract of the record above them. An abstract
  # row in position 1 has no record above it, so it is ignored.
  abstract_rows <- which(is.na(taggs_search_raw$uei))
  abstract_rows <- abstract_rows[abstract_rows > 1L]
  taggs_search_raw$abstract <- NA_character_
  taggs_search_raw$abstract[abstract_rows - 1L] <-
    as.character(taggs_search_raw$issue_date_fiscal_year[abstract_rows])

  taggs_search_raw |>
    dplyr::filter(!is.na(uei)) |>
    dplyr::mutate(
      # Strip the dollar sign before parsing the amount
      sum_of_actions = readr::parse_number(
        stringi::stri_replace_first_fixed(sum_of_actions, "$", "")
      ),
      action_issue_date = lubridate::mdy(action_issue_date)
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment