jmclawson · November 4, 2023 16:36 · jmclawson · Nov 4, 2023
diff --git a/corpus_micusp.R b/corpus_micusp.R
 # helper function get_if_needed for downloading online documents exactly once: https://gist.github.com/jmclawson/65899e2de6bfee692b08141a98422240
 source("https://gist.githubusercontent.com/jmclawson/65899e2de6bfee692b08141a98422240/raw/7c5590377332e427691f2331b69abd58be2141ec/get_if_needed.R")

 #' Download (once) and read the MICUSP metadata table
 #'
 #' Fetches the metadata CSV through `get_if_needed()` and reads it back with
 #' cleaned column names.
 #'
 #' @param micusp_dir Directory passed to `get_if_needed()` as `destdir`; the
 #'   CSV is then read from this same directory.
 #' @return The parsed metadata, with column names normalized by
 #'   `janitor::clean_names()`.
 get_micusp_metadata <- function(micusp_dir = "micusp") {
   metadata_file <- "micusp_metadata.csv"

   get_if_needed(
     "https://elicorpora.info/browse?mode=download&start=1&sort=dept&direction=desc",
     filename = metadata_file,
     destdir = micusp_dir
   )

   # Read from the directory the caller asked for rather than a hard-coded
   # "micusp/", so a non-default `micusp_dir` reads the file just fetched.
   # Assumes get_if_needed() stores the file at destdir/filename, as the
   # default arguments imply.
   readr::read_csv(
     file.path(micusp_dir, metadata_file),
     show_col_types = FALSE
   ) |>
     janitor::clean_names()
 }

 #' Return the text of one MICUSP paper, caching it as a .txt file
 #'
 #' Looks for `<textdir>/<id>.txt`, where `<id>` is `paperid` with every "."
 #' replaced by "_". If that file is missing, the text of the
 #' `div#paperBody` element is extracted from `<htmldir>/<id>.html` and
 #' written to the text file first. This function does not download
 #' anything itself; the HTML file has to be present already.
 #'
 #' @param paperid A single paper id (used in an `if()` test, so one id per
 #'   call).
 #' @param htmldir Directory holding the saved HTML pages.
 #' @param textdir Directory for the cached plain-text files; created when
 #'   it does not exist.
 #' @return A length-one character vector: the lines of the cached text file
 #'   joined with "\n".
 parse_micusp_paper <- function(paperid,
                                htmldir = "micusp/corpus_html",
                                textdir = "micusp/corpus") {
   # Both file names share the same stem: the id with dots made file-safe.
   file_stem <- stringr::str_replace_all(paperid, "[.]", "_")
   filename_text <- file.path(textdir, paste0(file_stem, ".txt"))
   filename_html <- file.path(htmldir, paste0(file_stem, ".html"))

   # recursive = TRUE so a nested `textdir` works even when its parent
   # directories have not been created yet.
   if (!dir.exists(textdir)) {
     dir.create(textdir, recursive = TRUE)
   }

   # Only parse the HTML when no cached text file exists.
   if (!file.exists(filename_text)) {
     filename_html |>
       rvest::read_html() |>
       rvest::html_element(css = "div#paperBody") |>
       rvest::html_text() |>
       readr::write_lines(filename_text)
   }

   readr::read_lines(filename_text) |>
     paste0(collapse = "\n")
 }

 #' Build a MICUSP corpus table for a filtered set of papers
 #'
 #' Reads the metadata, keeps the rows matching `...`, fetches each paper's
 #' HTML page through `get_if_needed()`, and adds the paper text in a `text`
 #' column.
 #'
 #' @param ... Filter conditions forwarded to `dplyr::filter()` on the
 #'   metadata table.
 #' @return The filtered metadata as a row-wise data frame with an added
 #'   `text` column from `parse_micusp_paper()`. The row-wise grouping is
 #'   kept for backward compatibility; call `dplyr::ungroup()` if it is not
 #'   wanted.
 get_micusp_corpus <- function(...) {
   the_df <-
     get_micusp_metadata() |>
     dplyr::filter(...)

   paper_ids <- dplyr::pull(the_df, paper_id)

   # Iterate per id instead of building URL/file-name vectors up front:
   # paste0() turns a zero-length id vector into one bogus string, which
   # would trigger a download for an empty selection. Walking the ids also
   # avoids piping a value into a call whose .x/.y/.f are already named.
   purrr::walk(paper_ids, \(id) {
     get_if_needed(
       paste0("https://elicorpora.info/view?pid=", id),
       paste0(stringr::str_replace_all(id, "[.]", "_"), ".html"),
       destdir = "micusp/corpus_html"
     )
   })

   # parse_micusp_paper() handles one id per call, hence rowwise().
   the_df |>
     dplyr::rowwise() |>
     dplyr::mutate(text = parse_micusp_paper(paper_id))
 }
	# helper function get_if_needed for downloading online documents exactly once: https://gist.github.com/jmclawson/65899e2de6bfee692b08141a98422240
	source("https://gist.githubusercontent.com/jmclawson/65899e2de6bfee692b08141a98422240/raw/7c5590377332e427691f2331b69abd58be2141ec/get_if_needed.R")

	#' Download (once) and read the MICUSP metadata table
	#'
	#' Fetches the metadata CSV through `get_if_needed()` and reads it back with
	#' cleaned column names.
	#'
	#' @param micusp_dir Directory passed to `get_if_needed()` as `destdir`; the
	#'   CSV is then read from this same directory.
	#' @return The parsed metadata, with column names normalized by
	#'   `janitor::clean_names()`.
	get_micusp_metadata <- function(micusp_dir = "micusp") {
	  metadata_file <- "micusp_metadata.csv"

	  get_if_needed(
	    "https://elicorpora.info/browse?mode=download&start=1&sort=dept&direction=desc",
	    filename = metadata_file,
	    destdir = micusp_dir
	  )

	  # Read from the directory the caller asked for rather than a hard-coded
	  # "micusp/", so a non-default `micusp_dir` reads the file just fetched.
	  # Assumes get_if_needed() stores the file at destdir/filename, as the
	  # default arguments imply.
	  readr::read_csv(
	    file.path(micusp_dir, metadata_file),
	    show_col_types = FALSE
	  ) |>
	    janitor::clean_names()
	}

	#' Return the text of one MICUSP paper, caching it as a .txt file
	#'
	#' Looks for `<textdir>/<id>.txt`, where `<id>` is `paperid` with every "."
	#' replaced by "_". If that file is missing, the text of the
	#' `div#paperBody` element is extracted from `<htmldir>/<id>.html` and
	#' written to the text file first. This function does not download
	#' anything itself; the HTML file has to be present already.
	#'
	#' @param paperid A single paper id (used in an `if()` test, so one id per
	#'   call).
	#' @param htmldir Directory holding the saved HTML pages.
	#' @param textdir Directory for the cached plain-text files; created when
	#'   it does not exist.
	#' @return A length-one character vector: the lines of the cached text file
	#'   joined with "\n".
	parse_micusp_paper <- function(paperid,
	                               htmldir = "micusp/corpus_html",
	                               textdir = "micusp/corpus") {
	  # Both file names share the same stem: the id with dots made file-safe.
	  file_stem <- stringr::str_replace_all(paperid, "[.]", "_")
	  filename_text <- file.path(textdir, paste0(file_stem, ".txt"))
	  filename_html <- file.path(htmldir, paste0(file_stem, ".html"))

	  # recursive = TRUE so a nested `textdir` works even when its parent
	  # directories have not been created yet.
	  if (!dir.exists(textdir)) {
	    dir.create(textdir, recursive = TRUE)
	  }

	  # Only parse the HTML when no cached text file exists.
	  if (!file.exists(filename_text)) {
	    filename_html |>
	      rvest::read_html() |>
	      rvest::html_element(css = "div#paperBody") |>
	      rvest::html_text() |>
	      readr::write_lines(filename_text)
	  }

	  readr::read_lines(filename_text) |>
	    paste0(collapse = "\n")
	}

	#' Build a MICUSP corpus table for a filtered set of papers
	#'
	#' Reads the metadata, keeps the rows matching `...`, fetches each paper's
	#' HTML page through `get_if_needed()`, and adds the paper text in a `text`
	#' column.
	#'
	#' @param ... Filter conditions forwarded to `dplyr::filter()` on the
	#'   metadata table.
	#' @return The filtered metadata as a row-wise data frame with an added
	#'   `text` column from `parse_micusp_paper()`. The row-wise grouping is
	#'   kept for backward compatibility; call `dplyr::ungroup()` if it is not
	#'   wanted.
	get_micusp_corpus <- function(...) {
	  the_df <-
	    get_micusp_metadata() |>
	    dplyr::filter(...)

	  paper_ids <- dplyr::pull(the_df, paper_id)

	  # Iterate per id instead of building URL/file-name vectors up front:
	  # paste0() turns a zero-length id vector into one bogus string, which
	  # would trigger a download for an empty selection. Walking the ids also
	  # avoids piping a value into a call whose .x/.y/.f are already named.
	  purrr::walk(paper_ids, \(id) {
	    get_if_needed(
	      paste0("https://elicorpora.info/view?pid=", id),
	      paste0(stringr::str_replace_all(id, "[.]", "_"), ".html"),
	      destdir = "micusp/corpus_html"
	    )
	  })

	  # parse_micusp_paper() handles one id per call, hence rowwise().
	  the_df |>
	    dplyr::rowwise() |>
	    dplyr::mutate(text = parse_micusp_paper(paper_id))
	}