@jmclawson
Last active July 21, 2022 15:14
Preprocess EEBO TCP full text, cleaning OCR blips and removing page references
##### Load libraries #####
library(dplyr)
library(stringi)
library(stringr)
library(tidyr)
library(stringdist)
library(tokenizers)
##### Set up replicable workflow #####
# Set the directories to be used. Both directories should exist in the project directory, and dir_start should contain the text files to be processed.
dir_start <- "data/raw/"
dir_end <- "data/processed/"
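# Several functions below expect a character vector of file paths named `the_files`.
# A minimal sketch, assuming the raw corpus sits in dir_start as plain-text files
# (adjust the pattern to match the actual filenames):
the_files <- list.files(dir_start, pattern = "\\.txt$", full.names = TRUE)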
##### Read and study documents #####
# This function loads a file as a vector of lines.
get_lines <- function(file) {
file |>
scan(what = character(),
sep = "\n",
quiet = TRUE)
}
# This basic function splits lines at each space. It's good for keeping punctuation and nonstandard characters.
get_words_preliminary <- function(textlines) {
textlines |>
stri_trans_general("latin-ascii") |>
strsplit(" ")
}
# This better function converts words to lowercase and strips punctuation. It's good for ignoring nonstandard characters.
get_words <- function(textlines) {
textlines |>
stri_trans_general("latin-ascii") |>
tokenize_words()
}
# This function simplifies words to frequency counts.
get_table <- function(textwords) {
the_table <- textwords |>
unlist() |>
table() |>
data.frame() |>
arrange(desc(Freq))
the_table$Var1 <- as.character(the_table$Var1)
colnames(the_table)[1] <- "Word"
return(the_table)
}
# This function combines all of the above, keeping punctuation and nonstandard characters.
get_process_preliminary <- function(file) {
file |>
get_lines() |>
get_words_preliminary() |>
get_table()
}
# This function combines all of the above, ignoring nonstandard characters.
get_process <- function(file) {
file |>
get_lines() |>
get_words() |>
get_table()
}
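# Later steps use per-document frequency tables: `preliminary_list` (punctuation and
# nonstandard characters kept) and `words_list` (cleaned). A sketch of how they might
# be built, assuming `the_files` from the setup above:
preliminary_list <- lapply(the_files, get_process_preliminary)
words_list <- lapply(the_files, get_process)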
# This function gets counts of all nonstandard characters.
get_bad_characters <- function(list) {
list |>
unlist() |>
# Ignore all letters, numbers, and most punctuation.
gsub("[a-zA-Z,.:0-9?';() -]+","",x=_) |>
gsub("\\]","",x=_) |>
gsub("\\[","",x=_) |>
strsplit("") |>
unlist() |>
table()
}
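# Example usage (a sketch): tally the nonstandard characters in one document to see
# which glyphs need special handling downstream.
# get_lines(the_files[1]) |> get_words_preliminary() |> get_bad_characters()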
# This function shows occurrences of a word in its context, a few words to either side.
get_context <- function(word,
file=NA,
window = 3,
max = 3,
use_cleaned = FALSE,
fuzzy = FALSE,
try_fuzzy = TRUE,
check_lower = TRUE,
crayon = FALSE) {
if (use_cleaned) {
get_the_words <- function(...) {
get_words(...)
}
} else {
get_the_words <- function(...) {
get_words_preliminary(...)
}
}
if (!is.na(file)) {
the_words <- file |>
get_lines() |>
get_the_words() |>
unlist()
} else {
# Otherwise, search across the whole corpus (uses the `the_files` paths defined above).
the_words <- the_files |>
lapply(get_lines) |>
lapply(get_the_words) |>
unlist()
}
if (fuzzy) {
# The "fuzzy" options allows for regular expressions and partial matches.
index_word <- which(grepl(word, the_words))
} else {
index_word <- which(the_words==word)
}
# If there are no matches, try it in a way that ignores case sensitivity.
if (length(index_word)==0 & !fuzzy & check_lower) {
index_word <- which(tolower(the_words)==tolower(word))
} else if (length(index_word)==0 & fuzzy & check_lower) {
index_word <- which(grepl(tolower(word), tolower(the_words)))
}
if (length(index_word)==0 & !fuzzy & try_fuzzy & check_lower) {
index_word <- which(grepl(tolower(word), tolower(the_words)))
}
if (length(index_word)==0) {
warning(call. = FALSE,
paste0("The word '",
word,
"' isn't present."))
} else {
# Show up to `max` examples (or all matches, if there are fewer).
n_show <- min(max, length(index_word))
printit <- rep(NA, n_show)
for (l in 1:n_show) {
index_prephrase <- seq(index_word[l] - window,
index_word[l] - 1)
index_postphrase <- seq(index_word[l] + 1,
index_word[l] + window)
printit[l] <- the_words[index_prephrase] |>
c(toupper(the_words[index_word[l]]),
the_words[index_postphrase]) |>
paste(collapse = " ")
if(crayon) {
message(crayon::black(paste(the_words[index_prephrase], collapse = " ")),
" ", crayon::blue(the_words[index_word[l]]), " ",
crayon::black(paste(the_words[index_postphrase], collapse = " ")))
}
}
if(!crayon){printit}
}
}
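# Example usage (a sketch; "loue" is just a hypothetical variant spelling):
# get_context("loue", file = the_files[1], window = 3, max = 3)
# get_context("loue", fuzzy = TRUE, crayon = TRUE)  # search the whole corpus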
##### Understand and distinguish "good" and "bad" words #####
# Combine word frequencies among all documents in the corpus
# This is helpful for seeing which words are most common (i.e. most likely legitimate)
combine_freqs <- function(words = words_list) {
for (i in 1:length(words)) {
words[[i]] <- words[[i]] |>
mutate(Freq = Freq/sum(Freq))
colnames(words[[i]])[2] <- paste0("Freq", i)
}
combo_df <- words[[1]]
for (i in 2:length(words)) {
combo_df <- combo_df |>
full_join(words[[i]], by = "Word")
}
combo_df |>
pivot_longer(-Word, values_to = "frequency") |>
mutate(frequency = replace_na(frequency, 0)) |>
group_by(Word) |>
summarize(Average_Frequency = mean(frequency,
na.rm=TRUE)) |>
arrange(desc(Average_Frequency))
}
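# The functions below refer to the combined frequency table as `the_freqs`.
# A sketch, assuming `words_list` from above:
the_freqs <- combine_freqs(words_list)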
# Set up and detect words needing to be cleaned
setup_messiness <- function(prelim = preliminary_list) {
# The messy_words are derived from the raw texts with some minor cleaning
messy_words <- prelim |>
bind_rows() |>
pull(Word) |>
strsplit("\t") |>
unlist() |>
strsplit("-") |>
unlist() |>
gsub("^'t","t",x = _) |>
gsub("[,.0-9:;?]","",x = _) |>
gsub("\\)","",x = _) |>
gsub("\\(","",x = _) |>
gsub("\\]","",x = _) |>
gsub("\\[","",x = _) |>
table() |>
data.frame() |>
pull(Var1)
# What's left behind are all the words with nonstandard characters
variant_spellings <- setdiff(messy_words |> tolower(), the_freqs$Word)
# Finding the placement of these words will make it possible to see their contexts:
variant_index <-
grep(paste0("^", variant_spellings,
"$", collapse="|"),
messy_words |> tolower())
# Make these objects available in the global environment
messy_words <<- messy_words
variant_spellings <<- variant_spellings
variant_index <<- variant_index
}
# Automate converting bullet characters (which stand in for illegible letters) to regular expressions
bullet2regex <- function(string) {
splitup <- string |>
strsplit("") |>
unlist() |>
rle() |>
{\(x) setNames(x$lengths, x$values)}()
the_regex <- c()
for (i in 1:length(splitup)){
if (grepl("[a-z]", names(splitup)[i])) {
the_regex[i] <- names(splitup)[i] |>
rep(unname(splitup[i])) |>
paste0(collapse = "")
} else if (grepl("[•▪]", names(splitup)[i])) {
the_regex[i] <- paste0("[a-z]{",
unname(splitup[i]),
"}")
} else {
the_regex[i] <- ""
}
}
the_regex_2 <- the_regex |>
paste0(collapse = "")
the_regex_3 <- paste0("^", the_regex_2, "$")
return(the_regex_3)
}
match_regex <- function(string) {
the_freqs$Word[grep(bullet2regex(string),the_freqs$Word)] |>
paste0(collapse = ", ")
}
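# Example usage (a sketch): a bulleted form like "hea•en" becomes the pattern
# "^hea[a-z]{1}en$", and match_regex() lists the corpus words matching it.
# bullet2regex("hea•en")
# match_regex("hea•en")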
##### Train corrections #####
# Set up messiness and training data frame
setup_training <- function() {
# First, set up messiness
setup_messiness()
train_df <-
data.frame(index = variant_index,
variants = variant_spellings,
variant_printed = messy_words[variant_index],
guess1 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1)],
guess2 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1, method = "soundex")]) |>
filter(variants != "") |>
rowwise() |>
mutate(matches = variants |> match_regex(),
manual = NA,
guess2 = case_when(
guess2 == "the" ~ NA |> as.character(),
TRUE ~ guess2
)) |>
mutate(
guess1 = case_when(
is.na(guess1) & !is.na(matches) ~ matches |>
strsplit(",") |>
unlist() |>
{\(x) x[1]}() |>
trimws(),
TRUE ~ guess1)) |>
mutate(
guess2 = case_when(
is.na(guess2) & !is.na(guess1) & !is.na(matches) ~ matches |>
strsplit(",") |>
unlist() |>
setdiff(guess1) |>
{\(x) x[1]}() |>
trimws(),
TRUE ~ guess2)) |>
filter(grepl("[a-z]", variants))
# Make available to global environment
train_df <<- train_df
}
# Step through corrections
make_corrections <- function(n = NA, examples = 1) {
df <- train_df
uncorrected_rows <- sum(is.na(df$manual))
# If no n is provided, set to number of uncorrected rows
if (is.na(n)) {
n <- uncorrected_rows
}
# Limit n to number of uncorrected rows
if (n > uncorrected_rows) {
n <- uncorrected_rows
}
if (n == 0){
message("All done!")
return(df)
}
rows <- which(is.na(df$manual))[1:n]
for (r in rows) {
message(crayon::bold(crayon::black("Word: \n")),
crayon::black(" - "), df$variants[r])
message(crayon::bold(crayon::black("Context: \n")),
paste0(crayon::black(" - "),
get_context(df$variants[r],
window=3,
max=examples) |> paste0(collapse="\n - ")) |>
{\(x) gsub(df$variants[r],
toupper(df$variants[r]),
x)}())
message(crayon::bold(crayon::black("Matches: \n")),
crayon::black(" - "), df$matches[r])
message(crayon::bold(crayon::black("Guesses: \n")),
crayon::black("1 - "), df$guess1[r], "\n",
crayon::black("2 - "), df$guess2[r], "\n",
crayon::black("3 - (keep it as "), df$variants[r], crayon::black(")\n"),
crayon::black(crayon::italic("(Or type another word.)")))
correction <- readline(prompt="Correction: ")
if (correction == 1) {
correction <- df$guess1[r]
} else if (correction == 2) {
correction <- df$guess2[r]
} else if (correction == 3) {
correction <- df$variants[r]
}
df$manual[r] <- correction
}
# Save to global environment
train_df <<- df
}
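# Example training session (a sketch): build train_df, then step through a handful
# of corrections interactively, choosing a numbered guess or typing another word.
# setup_training()
# make_corrections(n = 10, examples = 2)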
##### Apply and process files #####
# Use train_df to replace errors
make_replacements <- function(text, replacement_set = train_df){
replacer <- replacement_set |>
filter(variants != manual) |>
pull(manual) |>
setNames(replacement_set |>
filter(variants != manual) |>
pull(variant_printed))
str_replace_all(text, replacer)
}
# Strip EEBO references to pages and images
clean_pages <- function(text){
text |>
stri_replace_all_regex(
"Unnumbered page",
"") |>
stri_replace_all_regex(
"View Image - Image [0-9]+ \\(unnumbered page\\)",
"") |>
stri_replace_all_regex(
"Image [0-9]+ \\(unnumbered page\\)",
"") |>
stri_replace_all_regex(
"View Image - Image [0-9]+ \\(page [0-9]+\\)",
"") |>
stri_replace_all_regex(
"Image [0-9]+ \\(page [0-9]+\\)",
"") |>
stri_replace_all_regex(
"^Page [0-9]+$",
"")
}
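# Example (a sketch): EEBO boilerplate lines such as
# "View Image - Image 12 (page 34)" or "Page 7" are reduced to empty strings.
# clean_pages(c("View Image - Image 12 (page 34)", "Page 7"))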
# Clean other special characters and footnote references
clean_special <- function(text){
text |>
gsub("[¶/&•<>▪◊]"," ",x=_) |>
gsub("\\[[0-9]\\]"," ",x=_) |>
gsub("[ ]{2,}"," ",x=_) |>
trimws()
}
# Apply functions to file and save to dir_end
process_file <- function(file) {
new_path <- paste0(dir_end, basename(file))
file |>
get_lines() |>
make_replacements() |>
clean_pages() |>
clean_special() |>
writeLines(new_path)
}
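# Final pass (a sketch): once train_df holds the manual corrections, apply the whole
# pipeline to every file and write the results to dir_end (assumed to exist already).
# invisible(lapply(the_files, process_file))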