fauxneticien · March 10, 2018 00:33
diff --git a/pdf_wrangler.R b/pdf_wrangler.R
 options(stringsAsFactors = FALSE)

 library(tidyverse)
 library(pdftools)
 library(purrr)
 library(zoo)
 library(readr)

 headers_df <-
     # Build a page index from the running headers of the lexicon PDF: one row
     # per page, holding the first and last headword printed at the top of it.
     #
     # Input is the 406-page, lexicon-only PDF (no front matter, reversal list,
     # etc.); the code below treats each element returned by pdf_text() as one
     # page.
     pdf_text(pdf = "ngk-lex-only.pdf") %>%

     # Keep only the first line of each page, i.e. the header. Expected shape:
     # "w ard ak k id j        w a rd w a rd n a m" -- at most 1 space inside a
     # word and at least 2 spaces between the two words.
     map(~ str_split(string = ., pattern = "\\n") %>% unlist() %>% .[1]) %>%

     map(~ str_replace_all(string      = .,
                           pattern     = "\\s{2,}",
                           replacement = "@") %>%
           # every run of 2+ whitespace characters is now a single "@":
           # "w ard ak k id j@w a rd w a rd n a m"

           str_split("@") %>%
           unlist() %>%
           # c("w ard ak k id j", "w a rd w a rd n a m")

           # Remove the letter-spacing, then normalise: "ü" becomes "û"
           # (presumably to match the lexicon spelling -- confirm), and the
           # spaces around "~" and "..." that the previous step stripped are
           # put back.
           str_replace_all(pattern = "\\s", replacement = "") %>%
           str_replace_all(pattern = "ü", replacement = "û") %>%
           str_replace_all(pattern = "~", replacement = " ~ ") %>%
           str_replace_all(pattern = "\\.\\.\\.", replacement = " \\.\\.\\. ")
           # c("wardakkidj", "wardwardnam")
     ) %>%

     # One row per page; a header with a single word leaves last_word as NA.
     map_df(~ data.frame(first_word = .[1], last_word = .[2])) %>%

     # row_number() rather than 1:n(): it stays empty when there are no rows.
     mutate(page = row_number()) %>%
     select(page, first_word, last_word) %>%

     # Hand-made corrections so that header words equal the lookup forms built
     # from the lexicon (homonym numbers added; "maria" looks like a misread
     # "marla" -- NOTE(review): confirm against the PDF).
     mutate(
         first_word = ifelse(first_word == "bark", "bark1", first_word),
         first_word = ifelse(first_word == "dja-", "dja-1", first_word),
         first_word = ifelse(first_word == "maria", "marla", first_word),
         first_word = ifelse(first_word == "mey", "mey1", first_word)
     )

 lexicon_df <-
    # Parse the backslash-coded lexicon text file into one row per line with
    # columns: line (line number), code, value, and lx_id (entry identifier).
    readLines("dalabon_dict_orig_2003.txt") %>%

    # Pair each line with its line number. seq_along() yields an empty vector
    # for an empty file, where 1:length(.) would yield c(1, 0) and make
    # tibble() fail on mismatched column lengths.
    tibble(line = seq_along(.),
           data = .) %>%

    # Split "\code value" into the marker (lower-case letters after the
    # backslash) and the text after the single space. Lines that do not match
    # the pattern end up with NA in both columns.
    extract(col   = data,
            into  = c("code", "value"),
            regex = "^\\\\([a-z]+) (.*)$") %>%

    # An entry is identified by the line number of its "\lx" line; carry that
    # number forward over the following lines. Lines before the first "\lx"
    # keep NA because of na.rm = FALSE.
    mutate(lx_id = ifelse(code == "lx", line, NA) %>% na.locf(na.rm = FALSE))

 headwords <-
    # One row per lexicon entry (lx_id) with its headword (lx), homonym
    # number (hm) and a normalised `lookup` key used to match PDF page headers.
    lexicon_df %>%
    filter(code %in% c("lx", "hm")) %>%
    select(-line) %>%
    group_by(lx_id) %>%
    # code values become columns: one `lx` and one `hm` column per entry
    spread(key = code, value = value) %>%
    # entries without a homonym number get an empty suffix
    mutate(hm = ifelse(!is.na(hm), hm, "")) %>%
    mutate(lx = str_trim(str_replace_all(lx, "ž", "û"))) %>%
    # lookup key: headword + homonym number, all whitespace removed ...
    mutate(lookup = str_replace_all(paste0(lx, hm), "\\s+", "")) %>%
    # ... then spacing restored around "..." and "~"
    mutate(lookup = str_replace_all(lookup, "\\.\\.\\.", " \\.\\.\\. ")) %>%
    mutate(lookup = str_replace_all(lookup, "~", " ~ ")) %>%
    # a leading hyphen is dropped from the key, except for these three forms
    mutate(lookup = ifelse(lookup %in% c("-kadji", "-kan", "-ki"),
                           lookup,
                           str_replace(lookup, "^-", "")))

 # Sanity check: the first word of every page header must exist among the
 # lexicon lookup keys, otherwise pages cannot be assigned to headwords.
 missing_lookups <-
    filter(headers_df, !first_word %in% headwords$lookup)

 if (nrow(missing_lookups) > 0) {
    # stop() already prefixes the message with "Error:", so the text itself
    # does not repeat it; the offending words and their pages are listed so
    # the mismatch can be fixed by hand.
    stop(
        "not all headwords from title page found in lexicon: ",
        paste0(missing_lookups$first_word,
               " (page ", missing_lookups$page, ")",
               collapse = ", "),
        call. = FALSE
    )
 }

 # Build the headword -> page index and show it in the data viewer.
 headwords %>%
    ungroup() %>%
    # lx_id is the line number of the entry's "\lx" line, so this is file order
    arrange(lx_id) %>%
    # Attach the page number to the entry that opens each page. The key is
    # named explicitly: `lookup` is the only column the two tables share, and
    # spelling it out avoids an implicit natural join.
    left_join(select(headers_df, lookup = first_word, page), by = "lookup") %>%
    mutate(headword = paste0(lx, hm),
           # every following entry is on the same page until the next page
           # opener; entries before the first match stay NA (na.rm = FALSE)
           page     = na.locf(page, na.rm = FALSE)) %>%
    select(headword, page) %>%
    View() # Have write_csv commented out by default
    # write_csv("headword_index.csv")
	options(stringsAsFactors = FALSE)

	library(tidyverse)
	library(pdftools)
	library(purrr)
	library(zoo)
	library(readr)

	headers_df <-
	  # Build a page index from the running headers of the lexicon PDF: one row
	  # per page, holding the first and last headword printed at the top of it.
	  #
	  # Input is the 406-page, lexicon-only PDF (no front matter, reversal list,
	  # etc.); the code below treats each element returned by pdf_text() as one
	  # page.
	  pdf_text(pdf = "ngk-lex-only.pdf") %>%

	  # Keep only the first line of each page, i.e. the header. Expected shape:
	  # "w ard ak k id j        w a rd w a rd n a m" -- at most 1 space inside a
	  # word and at least 2 spaces between the two words.
	  map(~ str_split(string = ., pattern = "\\n") %>% unlist() %>% .[1]) %>%

	  map(~ str_replace_all(string      = .,
	                        pattern     = "\\s{2,}",
	                        replacement = "@") %>%
	        # every run of 2+ whitespace characters is now a single "@":
	        # "w ard ak k id j@w a rd w a rd n a m"

	        str_split("@") %>%
	        unlist() %>%
	        # c("w ard ak k id j", "w a rd w a rd n a m")

	        # Remove the letter-spacing, then normalise: "ü" becomes "û"
	        # (presumably to match the lexicon spelling -- confirm), and the
	        # spaces around "~" and "..." that the previous step stripped are
	        # put back.
	        str_replace_all(pattern = "\\s", replacement = "") %>%
	        str_replace_all(pattern = "ü", replacement = "û") %>%
	        str_replace_all(pattern = "~", replacement = " ~ ") %>%
	        str_replace_all(pattern = "\\.\\.\\.", replacement = " \\.\\.\\. ")
	        # c("wardakkidj", "wardwardnam")
	  ) %>%

	  # One row per page; a header with a single word leaves last_word as NA.
	  map_df(~ data.frame(first_word = .[1], last_word = .[2])) %>%

	  # row_number() rather than 1:n(): it stays empty when there are no rows.
	  mutate(page = row_number()) %>%
	  select(page, first_word, last_word) %>%

	  # Hand-made corrections so that header words equal the lookup forms built
	  # from the lexicon (homonym numbers added; "maria" looks like a misread
	  # "marla" -- NOTE(review): confirm against the PDF).
	  mutate(
	    first_word = ifelse(first_word == "bark", "bark1", first_word),
	    first_word = ifelse(first_word == "dja-", "dja-1", first_word),
	    first_word = ifelse(first_word == "maria", "marla", first_word),
	    first_word = ifelse(first_word == "mey", "mey1", first_word)
	  )

	lexicon_df <-
	  # Parse the backslash-coded lexicon text file into one row per line with
	  # columns: line (line number), code, value, and lx_id (entry identifier).
	  readLines("dalabon_dict_orig_2003.txt") %>%

	  # Pair each line with its line number. seq_along() yields an empty vector
	  # for an empty file, where 1:length(.) would yield c(1, 0) and make
	  # tibble() fail on mismatched column lengths.
	  tibble(line = seq_along(.),
	         data = .) %>%

	  # Split "\code value" into the marker (lower-case letters after the
	  # backslash) and the text after the single space. Lines that do not match
	  # the pattern end up with NA in both columns.
	  extract(col   = data,
	          into  = c("code", "value"),
	          regex = "^\\\\([a-z]+) (.*)$") %>%

	  # An entry is identified by the line number of its "\lx" line; carry that
	  # number forward over the following lines. Lines before the first "\lx"
	  # keep NA because of na.rm = FALSE.
	  mutate(lx_id = ifelse(code == "lx", line, NA) %>% na.locf(na.rm = FALSE))

	headwords <-
	  # One row per lexicon entry (lx_id) with its headword (lx), homonym
	  # number (hm) and a normalised `lookup` key used to match PDF page headers.
	  lexicon_df %>%
	  filter(code %in% c("lx", "hm")) %>%
	  select(-line) %>%
	  group_by(lx_id) %>%
	  # code values become columns: one `lx` and one `hm` column per entry
	  spread(key = code, value = value) %>%
	  # entries without a homonym number get an empty suffix
	  mutate(hm = ifelse(!is.na(hm), hm, "")) %>%
	  mutate(lx = str_trim(str_replace_all(lx, "ž", "û"))) %>%
	  # lookup key: headword + homonym number, all whitespace removed ...
	  mutate(lookup = str_replace_all(paste0(lx, hm), "\\s+", "")) %>%
	  # ... then spacing restored around "..." and "~"
	  mutate(lookup = str_replace_all(lookup, "\\.\\.\\.", " \\.\\.\\. ")) %>%
	  mutate(lookup = str_replace_all(lookup, "~", " ~ ")) %>%
	  # a leading hyphen is dropped from the key, except for these three forms
	  mutate(lookup = ifelse(lookup %in% c("-kadji", "-kan", "-ki"),
	                         lookup,
	                         str_replace(lookup, "^-", "")))

	# Sanity check: the first word of every page header must exist among the
	# lexicon lookup keys, otherwise pages cannot be assigned to headwords.
	missing_lookups <-
	  filter(headers_df, !first_word %in% headwords$lookup)

	if (nrow(missing_lookups) > 0) {
	  # stop() already prefixes the message with "Error:", so the text itself
	  # does not repeat it; the offending words and their pages are listed so
	  # the mismatch can be fixed by hand.
	  stop(
	    "not all headwords from title page found in lexicon: ",
	    paste0(missing_lookups$first_word,
	           " (page ", missing_lookups$page, ")",
	           collapse = ", "),
	    call. = FALSE
	  )
	}

	# Build the headword -> page index and show it in the data viewer.
	headwords %>%
	  ungroup() %>%
	  # lx_id is the line number of the entry's "\lx" line, so this is file order
	  arrange(lx_id) %>%
	  # Attach the page number to the entry that opens each page. The key is
	  # named explicitly: `lookup` is the only column the two tables share, and
	  # spelling it out avoids an implicit natural join.
	  left_join(select(headers_df, lookup = first_word, page), by = "lookup") %>%
	  mutate(headword = paste0(lx, hm),
	         # every following entry is on the same page until the next page
	         # opener; entries before the first match stay NA (na.rm = FALSE)
	         page     = na.locf(page, na.rm = FALSE)) %>%
	  select(headword, page) %>%
	  View() # Have write_csv commented out by default
	  # write_csv("headword_index.csv")