Preprocess EEBO TCP full text, cleaning OCR blips and removing page references
##### Load libraries #####
library(dplyr)
library(stringi)
library(stringr)
library(tidyr)
library(stringdist)
library(tokenizers)

##### Set up replicable workflow #####
# Set the directories to be used. Both directories should already exist in the project directory, and dir_start should contain the text files that need to be processed.
dir_start <- "data/raw/"
dir_end <- "data/processed/"
##### Read and study documents #####
# This function loads a file as a vector of lines.
get_lines <- function(file) {
  file |>
    scan(what = character(),
         sep = "\n",
         quiet = TRUE)
}

# This basic function splits lines at each space. It's good for keeping punctuation and nonstandard characters.
get_words_preliminary <- function(textlines) {
  textlines |>
    stri_trans_general("latin-ascii") |>
    strsplit(" ")
}

# This better function converts words to lowercase and strips punctuation. It's good for ignoring nonstandard characters.
get_words <- function(textlines) {
  textlines |>
    stri_trans_general("latin-ascii") |>
    tokenize_words()
}

# This function simplifies words to frequency counts.
get_table <- function(textwords) {
  the_table <- textwords |>
    unlist() |>
    table() |>
    data.frame() |>
    arrange(desc(Freq))
  the_table$Var1 <- as.character(the_table$Var1)
  colnames(the_table)[1] <- "Word"
  return(the_table)
}

# This function combines all of the above, keeping punctuation and nonstandard characters.
get_process_preliminary <- function(file) {
  file |>
    get_lines() |>
    get_words_preliminary() |>
    get_table()
}

# This function combines all of the above, ignoring nonstandard characters.
get_process <- function(file) {
  file |>
    get_lines() |>
    get_words() |>
    get_table()
}
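# A sketch of building per-document word tables for the whole corpus.
# preliminary_list keeps punctuation and nonstandard characters, while
# words_list uses the cleaned tokenizer; the functions below default to
# these object names.
preliminary_list <- lapply(the_files, get_process_preliminary)
words_list <- lapply(the_files, get_process)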
# This function gets counts of all nonstandard characters.
get_bad_characters <- function(list) {
  list |>
    unlist() |>
    # Ignore all letters, numbers, and most punctuation.
    gsub("[a-zA-Z,.:0-9?';() -]+", "", x = _) |>
    gsub("\\]", "", x = _) |>
    gsub("\\[", "", x = _) |>
    strsplit("") |>
    unlist() |>
    table()
}
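# Example usage (a sketch): tally the nonstandard characters that survive in
# the preliminary word lists, to see which OCR blips are most common.
# the_files |>
#   lapply(get_lines) |>
#   lapply(get_words_preliminary) |>
#   get_bad_characters()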
# Understand words in context
get_context <- function(word,
                        file = NA,
                        window = 3,
                        max = 3,
                        use_cleaned = FALSE,
                        fuzzy = FALSE,
                        try_fuzzy = TRUE,
                        check_lower = TRUE,
                        crayon = FALSE) {
  if (use_cleaned) {
    get_the_words <- function(...) {
      get_words(...)
    }
  } else {
    get_the_words <- function(...) {
      get_words_preliminary(...)
    }
  }
  if (!is.na(file)) {
    the_words <- file |>
      get_lines() |>
      get_the_words() |>
      unlist()
  } else {
    # With no file given, search the whole corpus (assumes the_files exists).
    the_words <- the_files |>
      lapply(get_lines) |>
      lapply(get_the_words) |>
      unlist()
  }
  if (fuzzy) {
    # The "fuzzy" option allows for regular expressions and partial matches.
    index_word <- which(grepl(word, the_words))
  } else {
    index_word <- which(the_words == word)
  }
  # If there are no matches, try again in a way that ignores case sensitivity.
  if (length(index_word) == 0 & !fuzzy & check_lower) {
    index_word <- which(tolower(the_words) == tolower(word))
  } else if (length(index_word) == 0 & fuzzy & check_lower) {
    index_word <- which(grepl(tolower(word), tolower(the_words)))
  }
  if (length(index_word) == 0 & !fuzzy & try_fuzzy & check_lower) {
    index_word <- which(grepl(tolower(word), tolower(the_words)))
  }
  if (length(index_word) == 0) {
    warning(call. = FALSE,
            paste0("The word '",
                   word,
                   "' isn't present."))
  } else if (length(index_word) >= max) {
    printit <- rep(NA, max)
    for (l in 1:max) {
      index_prephrase <- seq(index_word[l] - window,
                             index_word[l] - 1)
      index_postphrase <- seq(index_word[l] + 1,
                              index_word[l] + window)
      printit[l] <- the_words[index_prephrase] |>
        c(toupper(the_words[index_word[l]]),
          the_words[index_postphrase]) |>
        paste(collapse = " ")
      if (crayon) {
        message(crayon::black(paste(the_words[index_prephrase], collapse = " ")),
                " ", crayon::blue(the_words[index_word[l]]), " ",
                crayon::black(paste(the_words[index_postphrase], collapse = " ")))
      }
    }
    if (!crayon) {printit}
  } else {
    printit <- rep(NA, length(index_word))
    for (l in 1:length(index_word)) {
      index_prephrase <- seq(index_word[l] - window,
                             index_word[l] - 1)
      index_postphrase <- seq(index_word[l] + 1,
                              index_word[l] + window)
      printit[l] <- the_words[index_prephrase] |>
        c(toupper(the_words[index_word[l]]),
          the_words[index_postphrase]) |>
        paste(collapse = " ")
      if (crayon) {
        message(crayon::black(paste(the_words[index_prephrase], collapse = " ")),
                " ", crayon::blue(the_words[index_word[l]]), " ",
                crayon::black(paste(the_words[index_postphrase], collapse = " ")))
      }
    }
    if (!crayon) {printit}
  }
}
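# Example usage (a sketch; the words shown here are only placeholders): show
# up to three contexts for a word across the whole corpus, or within one file,
# with fuzzy = TRUE treating the word as a regular expression.
# get_context("loue", max = 3)
# get_context("lo.e", file = the_files[1], fuzzy = TRUE)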
##### Understand and distinguish "good" and "bad" words #####
# Combine word frequencies among all documents in the corpus.
# This is helpful to see which words are most common (i.e. legitimate).
combine_freqs <- function(words = words_list) {
  for (i in 1:length(words)) {
    words[[i]] <- words[[i]] |>
      mutate(Freq = Freq/sum(Freq))
    colnames(words[[i]])[2] <- paste0("Freq", i)
  }
  combo_df <- words[[1]]
  for (i in 2:length(words)) {
    combo_df <- combo_df |>
      full_join(words[[i]], by = "Word")
  }
  combo_df |>
    pivot_longer(-Word, values_to = "frequency") |>
    mutate(frequency = replace_na(frequency, 0)) |>
    group_by(Word) |>
    summarize(Average_Frequency = mean(frequency,
                                       na.rm = TRUE)) |>
    arrange(desc(Average_Frequency))
}
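# A sketch of combining the per-document tables. The functions below look up
# a global data frame named the_freqs with a Word column, so the combined
# table is assigned to that name here.
the_freqs <- combine_freqs(words_list)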
# Set up and detect words needing to be cleaned
setup_messiness <- function(prelim = preliminary_list) {
  # The messy_words are derived from the raw texts with some minor cleaning.
  messy_words <- prelim |>
    bind_rows() |>
    pull(Word) |>
    strsplit("\t") |>
    unlist() |>
    strsplit("-") |>
    unlist() |>
    gsub("^'t", "t", x = _) |>
    gsub("[,.0-9:;?]", "", x = _) |>
    gsub("\\)", "", x = _) |>
    gsub("\\(", "", x = _) |>
    gsub("\\]", "", x = _) |>
    gsub("\\[", "", x = _) |>
    table() |>
    data.frame() |>
    pull(Var1)
  # What's left behind are all the words with nonstandard characters.
  variant_spellings <- setdiff(messy_words |> tolower(), the_freqs$Word)
  # Finding the placement of these words will make it possible to see their contexts:
  variant_index <-
    grep(paste0("^", variant_spellings,
                "$", collapse = "|"),
         messy_words |> tolower())
  # Make these objects available in the global environment.
  messy_words <<- messy_words
  variant_spellings <<- variant_spellings
  variant_index <<- variant_index
}

# Automate converting bullets to regular expressions
bullet2regex <- function(string) {
  splitup <- string |>
    strsplit("") |>
    unlist() |>
    rle() |>
    {\(x) setNames(x$lengths, x$values)}()
  the_regex <- c()
  for (i in 1:length(splitup)) {
    if (grepl("[a-z]", names(splitup)[i])) {
      the_regex[i] <- names(splitup)[i] |>
        rep(unname(splitup[i])) |>
        paste0(collapse = "")
    } else if (grepl("[•▪]", names(splitup)[i])) {
      the_regex[i] <- paste0("[a-z]{",
                             unname(splitup[i]),
                             "}")
    } else {
      the_regex[i] <- ""
    }
  }
  the_regex_2 <- the_regex |>
    paste0(collapse = "")
  the_regex_3 <- paste0("^", the_regex_2, "$")
  return(the_regex_3)
}

match_regex <- function(string) {
  the_freqs$Word[grep(bullet2regex(string), the_freqs$Word)] |>
    paste0(collapse = ", ")
}
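# Example (a sketch): EEBO's "•" and "▪" bullets stand in for illegible
# letters, so bullet2regex("lo•e") yields "^lo[a-z]{1}e$", and match_regex()
# lists every word in the_freqs matching that pattern.
# bullet2regex("lo•e")
# match_regex("lo•e")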
##### Train corrections #####
# Set up messiness and training data frame
setup_training <- function() {
  # First, set up messiness
  setup_messiness()
  train_df <-
    data.frame(index = variant_index,
               variants = variant_spellings,
               variant_printed = messy_words[variant_index],
               guess1 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1)],
               guess2 = the_freqs$Word[amatch(variant_spellings, the_freqs$Word, maxDist = 1, method = "soundex")]) |>
    filter(variants != "") |>
    rowwise() |>
    mutate(matches = variants |> match_regex(),
           manual = NA,
           guess2 = case_when(
             guess2 == "the" ~ NA |> as.character(),
             TRUE ~ guess2
           )) |>
    mutate(
      guess1 = case_when(
        is.na(guess1) & !is.na(matches) ~ matches |>
          strsplit(",") |>
          unlist() |>
          {\(x) x[1]}() |>
          trimws(),
        TRUE ~ guess1)) |>
    mutate(
      guess2 = case_when(
        is.na(guess2) & !is.na(guess1) & !is.na(matches) ~ matches |>
          strsplit(",") |>
          unlist() |>
          setdiff(guess1) |>
          {\(x) x[1]}() |>
          trimws(),
        TRUE ~ guess2)) |>
    filter(grepl("[a-z]", variants))
  # Make available to global environment
  train_df <<- train_df
}
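# Example (a sketch): build the training table of variant spellings with
# automatic guesses; assumes preliminary_list and the_freqs exist (see above).
# setup_training()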
# Step through corrections
make_corrections <- function(n = NA, examples = 1) {
  df <- train_df
  uncorrected_rows <- sum(is.na(df$manual))
  # If no n is provided, set to number of uncorrected rows
  if (is.na(n)) {
    n <- uncorrected_rows
  }
  # Limit n to number of uncorrected rows
  if (n > uncorrected_rows) {
    n <- uncorrected_rows
  }
  if (n == 0) {
    message("All done!")
    return(df)
  }
  rows <- which(is.na(df$manual))[1:n]
  for (r in rows) {
    message(crayon::bold(crayon::black("Word: \n")),
            crayon::black(" - "), df$variants[r])
    message(crayon::bold(crayon::black("Context: \n")),
            paste0(crayon::black(" - "),
                   get_context(df$variants[r],
                               window = 3,
                               max = examples) |> paste0(collapse = "\n - ")) |>
              {\(x) gsub(df$variants[r],
                         toupper(df$variants[r]),
                         x)}())
    message(crayon::bold(crayon::black("Matches: \n")),
            crayon::black(" - "), df$matches[r])
    message(crayon::bold(crayon::black("Guesses: \n")),
            crayon::black("1 - "), df$guess1[r], "\n",
            crayon::black("2 - "), df$guess2[r], "\n",
            crayon::black("3 - (keep it as "), df$variants[r], crayon::black(")\n"),
            crayon::black(crayon::italic("(Or type another word.)")))
    correction <- readline(prompt = "Correction: ")
    if (correction == 1) {
      correction <- df$guess1[r]
    } else if (correction == 2) {
      correction <- df$guess2[r]
    } else if (correction == 3) {
      correction <- df$variants[r]
    }
    df$manual[r] <- correction
  }
  # Save to global environment
  train_df <<- df
}
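# Example (a sketch): interactively review the first ten uncorrected variants,
# showing one context apiece; answers are written back into train_df.
# make_corrections(n = 10, examples = 1)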
##### Apply and process files #####
# Use train_df to replace errors
make_replacements <- function(text, replacement_set = train_df) {
  replacer <- replacement_set |>
    filter(variants != manual) |>
    pull(manual) |>
    setNames(replacement_set |>
               filter(variants != manual) |>
               pull(variant_printed))
  str_replace_all(text, replacer)
}

# Strip EEBO references to pages and images
clean_pages <- function(text) {
  text |>
    stri_replace_all_regex(
      "Unnumbered page",
      "") |>
    stri_replace_all_regex(
      "View Image - Image [0-9]+ \\(unnumbered page\\)",
      "") |>
    stri_replace_all_regex(
      "Image [0-9]+ \\(unnumbered page\\)",
      "") |>
    stri_replace_all_regex(
      "View Image - Image [0-9]+ \\(page [0-9]+\\)",
      "") |>
    stri_replace_all_regex(
      "Image [0-9]+ \\(page [0-9]+\\)",
      "") |>
    stri_replace_all_regex(
      "^Page [0-9]+$",
      "")
}

# Clean other special characters and footnote references
clean_special <- function(text) {
  text |>
    gsub("[¶/&•<>▪◊]", " ", x = _) |>
    gsub("\\[[0-9]\\]", " ", x = _) |>
    gsub("[ ]{2,}", " ", x = _) |>
    trimws()
}

# Apply functions to file and save to dir_end
process_file <- function(file) {
  new_path <- paste0(dir_end, basename(file))
  file |>
    get_lines() |>
    make_replacements() |>
    clean_pages() |>
    clean_special() |>
    writeLines(new_path)
}
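# End-to-end example (a sketch): once train_df has been corrected
# interactively, apply the replacements and cleaning to every file, writing
# the results into dir_end.
# lapply(the_files, process_file)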