Skip to content

Instantly share code, notes, and snippets.

@fauxneticien
Created March 10, 2018 00:33
Show Gist options
  • Save fauxneticien/bab8abd6693e388c9b68b48258ec8010 to your computer and use it in GitHub Desktop.
Save fauxneticien/bab8abd6693e388c9b68b48258ec8010 to your computer and use it in GitHub Desktop.
Wrangle PDF page headings into a data frame
options(stringsAsFactors = FALSE)
library(tidyverse)
library(pdftools)
library(purrr)
library(zoo)
library(readr)
# Build `headers_df`: one row per PDF page, holding the page number and the
# first and last headword printed in that page's running header.
headers_df <-
  # Read the 406-page PDF of the lexicon only (no front matter, reversal
  # list, etc.); the first and last word of each page are written at the top.
  pdf_text(pdf = "ngk-lex-only.pdf") %>%
  # From each page keep only the first line (the header). It is expected to
  # look like "w ard ak k id j   w a rd w a rd n a m", i.e. at most 1 space
  # within a word and at least 2 spaces between words.
  map_chr(~ str_split(string = ., pattern = "\\n")[[1]][1]) %>%
  map(
    # Mark the word boundary first, so that stray single spaces inside a word
    # can be removed afterwards without merging the two words.
    ~ str_replace_all(string = ., pattern = "\\s{2,}", replacement = "@") %>%
      # "w ard ak k id j@w a rd w a rd n a m"
      str_split("@") %>%
      unlist() %>%
      # c("w ard ak k id j", "w a rd w a rd n a m")
      str_replace_all(pattern = "\\s", replacement = "") %>%
      # c("wardakkidj", "wardwardnam")
      str_replace_all(pattern = "ü", replacement = "û") %>%
      # Re-insert spacing around "~" and "..." after all whitespace was
      # stripped; the lexicon lookup keys are normalised the same way.
      str_replace_all(pattern = "~", replacement = " ~ ") %>%
      str_replace_all(pattern = "\\.\\.\\.", replacement = " \\.\\.\\. ")
  ) %>%
  # A header with fewer than two words yields NA for the missing column(s).
  # `stringsAsFactors` is set explicitly so the result does not depend on the
  # global option.
  map_df(
    ~ data.frame(
      first_word = .[1],
      last_word = .[2],
      stringsAsFactors = FALSE
    )
  ) %>%
  # Pages are numbered by their position in the PDF, starting at 1.
  mutate(page = row_number()) %>%
  select(page, first_word, last_word) %>%
  # Hand corrections so header words match the lexicon lookup keys.
  # NOTE(review): presumably missing homonym numbers ("bark" -> "bark1") and
  # a misread "marla" -- confirm against the PDF.
  mutate(
    first_word = recode(
      first_word,
      "bark" = "bark1",
      "dja-" = "dja-1",
      "maria" = "marla",
      "mey" = "mey1"
    )
  )
# Parse the backslash-coded lexicon file into one row per input line with
# columns `line` (1-based line number), `code` (the marker without its
# backslash, e.g. "lx"), `value` (the text after the marker) and `lx_id`
# (line number of the most recent "\lx" line, identifying the entry).
# Lines that do not match "\<code> <value>" get NA for `code` and `value`.
lexicon_df <-
  readLines("dalabon_dict_orig_2003.txt") %>%
  # seq_along() instead of 1:length(): an empty file gives zero rows rather
  # than the bogus sequence c(1, 0).
  tibble(
    line = seq_along(.),
    data = .
  ) %>%
  # Namespace-qualified so that attaching magrittr later cannot mask it.
  tidyr::extract(
    col = data,
    into = c("code", "value"),
    regex = "^\\\\([a-z]+) (.*)$"
  ) %>%
  # Tag each "\lx" line with its own line number, then carry that number
  # forward so every following line belongs to that entry. Lines before the
  # first "\lx" keep NA.
  mutate(
    lx_id = na.locf(
      if_else(code == "lx", line, NA_integer_),
      na.rm = FALSE
    )
  )
# One row per lexicon entry (`lx_id`) with its headword `lx`, homonym number
# `hm` ("" when the entry has none) and `lookup`, the key that is matched
# against the first word of the PDF page headers.
headwords <-
  lexicon_df %>%
  filter(code %in% c("lx", "hm")) %>%
  select(-line) %>%
  # `lx_id` is the only remaining identifier column, so spread() already
  # yields one row per entry. No group_by() is needed, and the result is
  # left ungrouped so the mutate() below runs once over the whole table.
  spread(key = code, value = value) %>%
  mutate(
    hm = if_else(is.na(hm), "", hm),
    lx = str_replace_all(lx, "ž", "û") %>% str_trim(),
    # Normalise spacing the same way as the page headers: drop all
    # whitespace, then put single spaces around "..." and "~".
    lookup = paste0(lx, hm) %>%
      str_replace_all("\\s+", "") %>%
      str_replace_all("\\.\\.\\.", " \\.\\.\\. ") %>%
      str_replace_all("~", " ~ "),
    # Strip a leading hyphen from the key, except for these three entries,
    # which keep it.
    lookup = if_else(
      lookup %in% c("-kadji", "-kan", "-ki"),
      lookup,
      str_replace(lookup, "^-", "")
    )
  )
# Sanity check: the first word of every page header must be a known lexicon
# lookup key, otherwise that page cannot be anchored to an entry.
missing_lookups <-
  filter(headers_df, !first_word %in% headwords$lookup)

if (nrow(missing_lookups) > 0) {
  # stop() already prefixes "Error:", so the message does not repeat it.
  # The offending words and pages are listed to make the failure actionable.
  stop(
    "not all headwords from title page found in lexicon: ",
    paste0(
      missing_lookups$first_word,
      " (page ", missing_lookups$page, ")",
      collapse = ", "
    ),
    call. = FALSE
  )
}
# Build the headword -> page index. Entries are put in file order, the
# entries that open a page receive that page's number through the join, and
# the number is carried forward to every following entry.
headwords %>%
  ungroup() %>%
  arrange(lx_id) %>%
  # Explicit key: no "Joining, by = ..." message, and a column later added to
  # either table cannot silently change the join.
  left_join(
    select(headers_df, lookup = first_word, page),
    by = "lookup"
  ) %>%
  mutate(
    headword = paste0(lx, hm),
    # Entries before the first matched page header keep NA.
    page = na.locf(page, na.rm = FALSE)
  ) %>%
  select(headword, page) %>%
  View() # Have write_csv commented out by default
  # write_csv("headword_index.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment