Preprocess EEBO TCP full text, cleaning OCR blips and removing page references
##### Load libraries #####
library(dplyr)
library(stringi)
library(stringr)
library(tidyr)
library(stringdist)
library(tokenizers)

##### Set up replicable workflow #####
# Set the directories to be used. Both directories should already exist in the project directory, and dir_start should contain the text files that need to be processed.
dir_start <- "data/raw/"
dir_end <- "data/processed/"
##### Read and study documents #####
# This function loads a file as a vector of lines.
get_lines <- function(file) {
  file |>
    scan(what = character(),
         sep = "\n",
         quiet = TRUE)
}

# This basic function splits lines at each space. It's good for keeping punctuation and nonstandard characters.
get_words_preliminary <- function(textlines) {
  textlines |>
    stri_trans_general("latin-ascii") |>
    strsplit(" ")
}

# This better function converts words to lowercase and strips punctuation. It's good for ignoring nonstandard characters.
get_words <- function(textlines) {
  textlines |>
    stri_trans_general("latin-ascii") |>
    tokenize_words()
}

# This function simplifies words to frequency counts.
get_table <- function(textwords) {
  the_table <- textwords |>
    unlist() |>
    table() |>
    data.frame() |>
    arrange(desc(Freq))
  the_table$Var1 <- as.character(the_table$Var1)
  colnames(the_table)[1] <- "Word"
  return(the_table)
}

# This function combines all of the above, keeping punctuation and nonstandard characters.
get_process_preliminary <- function(file) {
  file |>
    get_lines() |>
    get_words_preliminary() |>
    get_table()
}

# This function combines all of the above, ignoring nonstandard characters.
get_process <- function(file) {
  file |>
    get_lines() |>
    get_words() |>
    get_table()
}
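# A sketch of building per-document word tables for the whole corpus.
# preliminary_list keeps punctuation and nonstandard characters, while
# words_list uses the cleaned tokenizer; the functions below default to
# these object names.
preliminary_list <- lapply(the_files, get_process_preliminary)
words_list <- lapply(the_files, get_process)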
# This function gets counts of all nonstandard characters.
get_bad_characters <- function(list) {
  list |>
    unlist() |>
    # Ignore all letters, numbers, and most punctuation.
    gsub("[a-zA-Z,.:0-9?';() -]+", "", x = _) |>
    gsub("\\]", "", x = _) |>
    gsub("\\[", "", x = _) |>
    strsplit("") |>
    unlist() |>
    table()
}
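# Example usage (a sketch): tally the nonstandard characters that survive in
# the preliminary word lists, to see which OCR blips are most common.
# the_files |>
#   lapply(get_lines) |>
#   lapply(get_words_preliminary) |>
#   get_bad_characters()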
# Understand words in context
get_context <- function(word,
                        file = NA,
                        window = 3,
                        max = 3,
                        use_cleaned = FALSE,
                        fuzzy = FALSE,
                        try_fuzzy = TRUE,
                        check_lower = TRUE,
                        crayon = FALSE) {
  if (use_cleaned) {
    get_the_words <- function(...) {
      get_words(...)
    }
  } else {
    get_the_words <- function(...) {
      get_words_preliminary(...)
    }
  }
  if (!is.na(file)) {
    the_words <- file |>
      get_lines() |>
      get_the_words() |>
      unlist()
  } else {
    # With no file given, search the whole corpus (assumes the_files exists).
    the_words <- the_files |>
      lapply(get_lines) |>
      lapply(get_the_words) |>
      unlist()
  }
  if (fuzzy) {
    # The "fuzzy" option allows for regular expressions and partial matches.
    index_word <- which(grepl(word, the_words))
  } else {
    index_word <- which(the_words == word)
  }
  # If there are no matches, try again in a way that ignores case sensitivity.
  if (length(index_word) == 0 & !fuzzy & check_lower) {
    index_word <- which(tolower(the_words) == tolower(word))
  } else if (length(index_word) == 0 & fuzzy & check_lower) {
    index_word <- which(grepl(tolower(word), tolower(the_words)))
  }
  if (length(index_word) == 0 & !fuzzy & try_fuzzy & check_lower) {
    index_word <- which(grepl(tolower(word), tolower(the_words)))
  }
  if (length(index_word) == 0) {
    warning(call. = FALSE,
            paste0("The word '",
                   word,
                   "' isn't present."))
  } else if (length(index_word) >= max) {
    printit <- rep(NA, max)
    for (l in 1:max) {
      index_prephrase <- seq(index_word[l] - window,
                             index_word[l] - 1)
      index_postphrase <- seq(index_word[l] + 1,
                              index_word[l] + window)
      printit[l] <- the_words[index_prephrase] |>
        c(toupper(the_words[index_word[l]]),
          the_words[index_postphrase]) |>
        paste(collapse = " ")
      if (crayon) {
        message(crayon::black(paste(the_words[index_prephrase], collapse = " ")),
                " ", crayon::blue(the_words[index_word[l]]), " ",
                crayon::black(paste(the_words[index_postphrase], collapse = " ")))
      }
    }
    if (!crayon) {printit}
  } else {
    printit <- rep(NA, length(index_word))
    for (l in 1:length(index_word)) {
      index_prephrase <- seq(index_word[l] - window,
                             index_word[l] - 1)
      index_postphrase <- seq(index_word[l] + 1,
                              index_word[l] + window)
      printit[l] <- the_words[index_prephrase] |>
        c(toupper(the_words[index_word[l]]),
          the_words[index_postphrase]) |>
        paste(collapse = " ")
      if (crayon) {
        message(crayon::black(paste(the_words[index_prephrase], collapse = " ")),
                " ", crayon::blue(the_words[index_word[l]]), " ",
                crayon::black(paste(the_words[index_postphrase], collapse = " ")))
      }
    }
    if (!crayon) {printit}
  }
}
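# Example usage (a sketch; the words shown here are only placeholders): show
# up to three contexts for a word across the whole corpus, or within one file,
# with fuzzy = TRUE treating the word as a regular expression.
# get_context("loue", max = 3)
# get_context("lo.e", file = the_files[1], fuzzy = TRUE)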
##### Understand and distinguish "good" and "bad" words #####
# Combine word frequencies among all documents in the corpus.
# This is helpful to see which words are most common (i.e. legitimate).
combine_freqs <- function(words = words_list) {
  for (i in 1:length(words)) {
    words[[i]] <- words[[i]] |>
      mutate(Freq = Freq/sum(Freq))
    colnames(words[[i]])[2] <- paste0("Freq", i)
  }
  combo_df <- words[[1]]
  for (i in 2:length(words)) {
    combo_df <- combo_df |>
      full_join(words[[i]], by = "Word")
  }
  combo_df |>
    pivot_longer(-Word, values_to = "frequency") |>
    mutate(frequency = replace_na(frequency, 0)) |>
    group_by(Word) |>
    summarize(Average_Frequency = mean(frequency,
                                       na.rm = TRUE)) |>
    arrange(desc(Average_Frequency))
}
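# A sketch of combining the per-document tables. The functions below look up
# a global data frame named the_freqs with a Word column, so the combined
# table is assigned to that name here.
the_freqs <- combine_freqs(words_list)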
# Set up and detect words needing to be cleaned
setup_messiness <- function(prelim = preliminary_list) {
  # The messy_words are derived from the raw texts with some minor cleaning.
  messy_words <- prelim |>
    bind_rows() |>
    pull(Word) |>
    strsplit("\t") |>
    unlist() |>
    strsplit("-") |>
    unlist() |>
    gsub("^'t", "t", x = _) |>
    gsub("[,.0-9:;?]", "", x = _) |>
    gsub("\\)", "", x = _) |>
    gsub("\\(", "", x = _) |>
    gsub("\\]", "", x = _) |>
    gsub("\\[", "", x = _) |>
    table() |>
    data.frame() |>
    pull(Var1)
  # What's left behind are all the words with nonstandard characters.
  variant_spellings <- setdiff(messy_words |> tolower(), the_freqs$Word)
  # Finding the placement of these words will make it possible to see their contexts:
  variant_index <-
    grep(paste0("^", variant_spellings,
                "$", collapse = "|"),
         messy_words |> tolower())
  # Make these objects available in the global environment.
  messy_words <<- messy_words
  variant_spellings <<- variant_spellings
  variant_index <<- variant_index
}

# Automate converting bullets to regular expressions
bullet2regex <- function(string) {
  splitup <- string |>
    strsplit("") |>
    unlist() |>
    rle() |>
    {\(x) setNames(x$lengths, x$values)}()
  the_regex <- c()
  for (i in 1:length(splitup)) {
    if (grepl("[a-z]", names(splitup)[i])) {
      the_regex[i] <- names(splitup)[i] |>
        rep(unname(splitup[i])) |>
        paste0(collapse = "")
    } else if (grepl("[•▪]", names(splitup)[i])) {
      the_regex[i] <- paste0("[a-z]{",
                             unname(splitup[i]),
                             "}")
    } else {
      the_regex[i] <- ""
    }
  }
  the_regex_2 <- the_regex |>
    paste0(collapse = "")
  the_regex_3 <- paste0("^", the_regex_2, "$")
  return(the_regex_3)
}

match_regex <- function(string) {
  the_freqs$Word[grep(bullet2regex(string), the_freqs$Word)] |>
    paste0(collapse = ", ")
}
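# Example (a sketch): EEBO's "•" and "▪" bullets stand in for illegible
# letters, so bullet2regex("lo•e") yields "^lo[a-z]{1}e$", and match_regex()
# lists every word in the_freqs matching that pattern.
# bullet2regex("lo•e")
# match_regex("lo•e")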
##### Train corrections #####
# Set up messiness and training data frame
setup_training <- function() {
  # First, set up messiness
  setup_messiness()
  train_df <-
    data.frame(index = variant_index,
               variants = variant_spellings,
               variant_printed = messy_words[variant_index],
               guess1 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1)],
               guess2 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1, method = "soundex")]) |>
    filter(variants != "") |>
    rowwise() |>
    mutate(matches = variants |> match_regex(),
           manual = NA,
           guess2 = case_when(
             guess2 == "the" ~ NA |> as.character(),
             TRUE ~ guess2
           )) |>
    mutate(
      guess1 = case_when(
        is.na(guess1) & !is.na(matches) ~ matches |>
          strsplit(",") |>
          unlist() |>
          {\(x) x[1]}() |>
          trimws(),
        TRUE ~ guess1)) |>
    mutate(
      guess2 = case_when(
        is.na(guess2) & !is.na(guess1) & !is.na(matches) ~ matches |>
          strsplit(",") |>
          unlist() |>
          setdiff(guess1) |>
          {\(x) x[1]}() |>
          trimws(),
        TRUE ~ guess2)) |>
    filter(grepl("[a-z]", variants))
  # Make available to global environment
  train_df <<- train_df
}
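# Example (a sketch): build the training table of variant spellings with
# automatic guesses; assumes preliminary_list and the_freqs exist (see above).
# setup_training()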
# Step through corrections
make_corrections <- function(n = NA, examples = 1) {
  df <- train_df
  uncorrected_rows <- sum(is.na(df$manual))
  # If no n is provided, set to number of uncorrected rows
  if (is.na(n)) {
    n <- uncorrected_rows
  }
  # Limit n to number of uncorrected rows
  if (n > uncorrected_rows) {
    n <- uncorrected_rows
  }
  if (n == 0) {
    message("All done!")
    return(df)
  }
  rows <- which(is.na(df$manual))[1:n]
  for (r in rows) {
    message(crayon::bold(crayon::black("Word: \n")),
            crayon::black(" - "), df$variants[r])
    message(crayon::bold(crayon::black("Context: \n")),
            paste0(crayon::black(" - "),
                   get_context(df$variants[r],
                               window = 3,
                               max = examples) |> paste0(collapse = "\n - ")) |>
              {\(x) gsub(df$variants[r],
                         toupper(df$variants[r]),
                         x)}())
    message(crayon::bold(crayon::black("Matches: \n")),
            crayon::black(" - "), df$matches[r])
    message(crayon::bold(crayon::black("Guesses: \n")),
            crayon::black("1 - "), df$guess1[r], "\n",
            crayon::black("2 - "), df$guess2[r], "\n",
            crayon::black("3 - (keep it as "), df$variants[r], crayon::black(")\n"),
            crayon::black(crayon::italic("(Or type another word.)")))
    correction <- readline(prompt = "Correction: ")
    if (correction == 1) {
      correction <- df$guess1[r]
    } else if (correction == 2) {
      correction <- df$guess2[r]
    } else if (correction == 3) {
      correction <- df$variants[r]
    }
    df$manual[r] <- correction
  }
  # Save to global environment
  train_df <<- df
}
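# Example (a sketch): interactively review the first ten uncorrected variants,
# showing one context apiece; answers are written back into train_df.
# make_corrections(n = 10, examples = 1)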
##### Apply and process files #####
# Use train_df to replace errors
make_replacements <- function(text, replacement_set = train_df) {
  replacer <- replacement_set |>
    filter(variants != manual) |>
    pull(manual) |>
    setNames(replacement_set |>
               filter(variants != manual) |>
               pull(variant_printed))
  str_replace_all(text, replacer)
}

# Strip EEBO references to pages and images
clean_pages <- function(text) {
  text |>
    stri_replace_all_regex(
      "Unnumbered page",
      "") |>
    stri_replace_all_regex(
      "View Image - Image [0-9]+ \\(unnumbered page\\)",
      "") |>
    stri_replace_all_regex(
      "Image [0-9]+ \\(unnumbered page\\)",
      "") |>
    stri_replace_all_regex(
      "View Image - Image [0-9]+ \\(page [0-9]+\\)",
      "") |>
    stri_replace_all_regex(
      "Image [0-9]+ \\(page [0-9]+\\)",
      "") |>
    stri_replace_all_regex(
      "^Page [0-9]+$",
      "")
}

# Clean other special characters and footnote references
clean_special <- function(text) {
  text |>
    gsub("[¶/&•<>▪◊]", " ", x = _) |>
    gsub("\\[[0-9]\\]", " ", x = _) |>
    gsub("[ ]{2,}", " ", x = _) |>
    trimws()
}

# Apply functions to file and save to dir_end
process_file <- function(file) {
  new_path <- paste0(dir_end, basename(file))
  file |>
    get_lines() |>
    make_replacements() |>
    clean_pages() |>
    clean_special() |>
    writeLines(new_path)
}
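# End-to-end example (a sketch): once train_df has been corrected
# interactively, apply the replacements and cleaning to every file, writing
# the results into dir_end.
# lapply(the_files, process_file)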