Created
March 10, 2018 00:33
-
-
Save fauxneticien/bab8abd6693e388c9b68b48258ec8010 to your computer and use it in GitHub Desktop.
Wrangle PDF page headings into a data frame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| options(stringsAsFactors = FALSE) | |
| library(tidyverse) | |
| library(pdftools) | |
| library(purrr) | |
| library(zoo) | |
| library(readr) | |
# Build one row per PDF page holding the first and last headword printed in
# that page's running header (columns: page, first_word, last_word).
headers_df <-
  # read 406-page PDF of lexicon (i.e. no front matter, reversal list, etc.)
  # where the first and last word are written at the top of each page
  pdf_text(pdf = "ngk-lex-only.pdf") %>%
  # from each page, keep only the first line (i.e. the header). we expect it
  # to look like "w ard ak k id j    w a rd w a rd n a m", i.e. with at most
  # 1 space within words, and at least 2 spaces between words
  map(~ str_split(string = ., pattern = "\\n") %>% unlist() %>% .[1]) %>%
  map(
    # mark the wide gaps between words with "@" so they survive the removal
    # of the single spaces inside words
    ~ str_replace_all(string = ., pattern = "\\s{2,}", replacement = "@") %>%
      # "w ard ak k id j@w a rd w a rd n a m"
      str_split("@") %>%
      unlist() %>%
      # c("w ard ak k id j", "w a rd w a rd n a m")
      str_replace_all(pattern = "\\s", replacement = "") %>%
      # c("wardakkidj", "wardwardnam")
      str_replace_all(pattern = "ü", replacement = "û") %>%
      # re-pad "~" and "..." with spaces after all whitespace was stripped
      str_replace_all(pattern = "~", replacement = " ~ ") %>%
      str_replace_all(pattern = "\\.\\.\\.", replacement = " \\.\\.\\. ")
  ) %>%
  # explicit stringsAsFactors so the result does not depend on the global
  # option set at the top of the script
  map_df(
    ~ data.frame(
      first_word = .[1],
      last_word = .[2],
      stringsAsFactors = FALSE
    )
  ) %>%
  # pages are in document order, so the row index is the page number;
  # row_number() (unlike 1:n()) is also safe when there are no rows
  mutate(page = row_number()) %>%
  select(page, first_word, last_word) %>%
  # hand corrections so that header words match the lexicon lookup keys.
  # NOTE(review): "bark"/"dja-"/"mey" presumably need a trailing "1" to match
  # the lx + hm keys built later in this script, and "maria" is presumably a
  # text-extraction misreading of "marla" -- confirm against the PDF
  mutate(
    first_word = ifelse(first_word == "bark", "bark1", first_word),
    first_word = ifelse(first_word == "dja-", "dja-1", first_word),
    first_word = ifelse(first_word == "maria", "marla", first_word),
    first_word = ifelse(first_word == "mey", "mey1", first_word)
  )
# Parse the backslash-coded lexicon text file into one row per input line:
# `line` is the line number, `code` the marker after the backslash, `value`
# the text after it, and `lx_id` the line number of the most recent "lx" line.
# NOTE(review): readLines() uses the session's native encoding; the
# "ž" -> "û" fix-up applied to `lx` later in this script suggests the file
# is not UTF-8 -- confirm and consider passing an explicit encoding.
lexicon_df <-
  readLines("dalabon_dict_orig_2003.txt") %>%
  # seq_along() instead of 1:length(): an empty file gives an empty tibble
  # rather than a length mismatch
  tibble(line = seq_along(.), data = .) %>%
  # "\lx foo" -> code = "lx", value = "foo"; lines that do not match the
  # pattern get NA in both columns. Namespaced so that magrittr::extract
  # cannot mask it.
  tidyr::extract(
    col = data,
    into = c("code", "value"),
    regex = "^\\\\([a-z]+) (.*)$"
  ) %>%
  mutate(
    # tag each "lx" line with its own line number, then carry that number
    # forward so every following line is tied to its entry; lines before the
    # first "lx" keep NA (na.rm = FALSE)
    lx_id = if_else(code == "lx", line, NA_integer_) %>%
      na.locf(na.rm = FALSE)
  )
# One row per lexicon entry (keyed by lx_id) with its `lx` and `hm` values
# ("" when there is no hm line) and a normalised `lookup` key that is
# compared against the first words taken from the PDF page headers.
headwords <-
  lexicon_df %>%
  filter(code %in% c("lx", "hm")) %>%
  select(-line) %>%
  # lx_id is the only remaining id column, so spread() already yields one row
  # per entry; grouping by lx_id first would only leave the result grouped
  # and make the mutate() below run once per single-row group
  spread(key = code, value = value) %>%
  mutate(
    hm = ifelse(is.na(hm), "", hm),
    lx = str_replace_all(lx, "ž", "û") %>% str_trim(),
    # same normalisation as applied to the page headers: strip all
    # whitespace, then re-pad "..." and "~" with single spaces
    lookup = paste0(lx, hm) %>%
      str_replace_all("\\s+", "") %>%
      str_replace_all("\\.\\.\\.", " \\.\\.\\. ") %>%
      str_replace_all("~", " ~ "),
    # drop a leading hyphen, except for these three forms, which keep it
    lookup = ifelse(
      lookup %in% c("-kadji", "-kan", "-ki"),
      lookup,
      str_replace(lookup, "^-", "")
    )
  )
# Sanity check: every page's first header word must exist as a lookup key in
# the lexicon, otherwise pages cannot be assigned to headwords reliably.
missing_lookups <-
  filter(headers_df, !first_word %in% headwords$lookup)

if (nrow(missing_lookups) > 0) {
  # no literal "Error: " prefix (R adds its own); list the offending header
  # words with their pages so they can be corrected by hand
  stop(
    "not all headwords from page headers found in lexicon: ",
    paste0(
      missing_lookups$first_word,
      " (page ", missing_lookups$page, ")",
      collapse = ", "
    ),
    call. = FALSE
  )
}
# Build the headword -> page index: entries whose lookup key is the first
# header word of a page get that page number, and every following entry (in
# lexicon order) inherits it until the next page starts.
# NOTE(review): if two entries share a lookup key, or one word opens two
# pages, the join below duplicates rows -- confirm this cannot happen.
headwords %>%
  ungroup() %>%
  arrange(lx_id) %>%
  # explicit key: "lookup" is the only shared column, and naming it keeps the
  # join stable (and silent) if columns are added later
  left_join(
    select(headers_df, lookup = first_word, page),
    by = "lookup"
  ) %>%
  mutate(
    headword = paste0(lx, hm),
    # carry the last seen page forward; entries before the first matched
    # header keep NA (na.rm = FALSE)
    page = na.locf(page, na.rm = FALSE)
  ) %>%
  select(headword, page) %>%
  View() # Have write_csv commented out by default
  # write_csv("headword_index.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment