Last active
July 12, 2017 13:40
-
-
Save tjmahr/1fbcf3cd6158372d3e4a9a6148acb3a2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("stringr") | |
# # Maybe unescape brackets? | |
# str_replace("\\[", "\\\\\\[", "[") | |
# str_replace_all(lines_to_check[139], "\\\\\\[", "[") | |
# # enforce_nonbreaking? | |
# lines_to_check %>% | |
# str_replace_all(" = ", " = ") %>% | |
# str_replace_all("(Table|Figure) (\\d)", "\\1 \\2") | |
clean_md_file <- function(file_path) { | |
# report | |
report <- readr::read_lines(file_path) | |
# Ignore the bibliography | |
line_references_start <- report %>% | |
str_detect("^References$") %>% | |
which | |
# Say the bibliography is last line if it's missing | |
if (length(line_references_start) == 0) { | |
line_references_start <- length(report) | |
} | |
main_lines <- report[seq_len(line_references_start)] | |
# Ignore lines that aren't prose | |
table_lines <- main_lines %>% str_detect("^[|]") | |
img_lines <- main_lines %>% str_detect("^[<img]") | |
header_lines <- main_lines %>% str_detect("^[-][-]|^[#]") | |
do_not_touch <- table_lines | img_lines | header_lines | |
lines_to_check <- main_lines[!do_not_touch] | |
main_lines[!do_not_touch] <- lines_to_check %>% | |
replace_inline_amper %>% | |
replace_hyphen_space | |
report[seq_len(line_references_start)] <- main_lines | |
report | |
} | |
replace_inline_amper <- function(text) { | |
# Assume that an inline citation consists of an author's last name followed by a | |
# parenthesized year. If we find those, we fix the inline citations with | |
# ampersands. | |
# Last names are letters and hyphens and spaces. | |
re_author <- "[[:alpha:]- ]+" | |
re_inline_year <- "[(]\\d{4}[)]" | |
re_author_year <- sprintf("(%s %s)", re_author, re_inline_year) | |
# Allow a comma for when 3 or more authors | |
re_maybe_comma <- "(,?)" | |
re_amper <- "( & )" | |
re_ampersand_author_year <- sprintf("%s(?=%s)", re_amper, re_author_year) | |
str_replace_all(text, re_ampersand_author_year, " and ") | |
# c("Maggie & Lisa (2005) found...", | |
# "...have been found (Maggie & Lisa, 2005)", | |
# "Jones & Hyphen-Name (2005) found...", | |
# "...have been found (Jones & Hyphen-Name, 2005)", | |
# "Marge, Maggie, & Lisa (2005) found...", | |
# "...have been found (Marge, Maggie, & Lisa, 2005)", | |
# "Jones & Space Name (2005) found...", | |
# "...have been found (Jones & Space Name, 2005)") %>% | |
# replace_inline_amper() -> text | |
} | |
replace_hyphen_space <- function(text) { | |
# text <- c("word- learning", "x- and y- centered", "five- andersons") | |
# Use a negative look-ahead to skip "x- and y-examples" | |
re_hypen_space <- "(\\w+)- (?!and )(\\w+)" | |
text %>% | |
str_replace_all(re_hypen_space, "\\1-\\2") %>% | |
str_replace_all("\\d- SD", "\\1-SD") | |
} | |
# file_path <- "./reports/09_appendix.md" | |
# stringi::stri_write_lines(clean_md_file(file_path), file_path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment