Last active
February 15, 2022 17:11
-
-
Save jmclawson/def66ac8635db9c6131c3f3ae092f6e5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import_bib.R
# Converts a BibTeX file into a data frame so the data can be worked with in R.
library(dplyr) | |
library(stringr) | |
library(tidyr) | |
# 0. Path of the BibTeX file to import
the_bibfile <- "~/path/to/my.bib"

# 1. Load the bib file into a character vector, splitting it on newlines
bibdata <- scan(
  file = the_bibfile,
  what = "character",
  # Junk at the top of the file? Uncomment `skip` and adjust the line count.
  # skip = 4,
  # Junk at the bottom of the file? Uncomment `nlines` and adjust it so that
  # reading stops before the footer.
  # nlines = 2285,
  sep = "\n"
)
# 2. Index the start and end of each entry
# Every line that begins with "@" opens a new entry.
startlines <- grep("^@", bibdata)
# An entry ends on the line just before the next entry starts; the last entry
# ends on the last line that was read. `startlines[-1]` is used instead of
# `startlines[2:length(startlines)]` because `2:1` counts backwards when the
# file holds a single entry. The final subset keeps `endlines` exactly as long
# as `startlines`, including the case where no entry was found.
endlines <- c(startlines[-1] - 1L, length(bibdata))[seq_along(startlines)]
# 3. Get a list of cite keys in order.
# The header line of an entry looks like "@type{key,". Entry types are matched
# in either case so that "@Article{" or "@BOOK{" are stripped as well, and
# everything from the first comma onwards is dropped so that nothing trailing
# the key on the header line leaks into it.
citekeys <- bibdata[startlines] |>
  str_remove("^@[A-Za-z]*\\{") |>
  str_remove(",.*$")
# 4. Convert the whole vector into a table, with one row per entry

# get_bibitem(): collapse the lines of one entry into a single string.
#
# Arguments:
#   data    character vector of lines; defaults to the global `bibdata`
#   n       position of the wanted entry (1 = first entry)
#   starts  index of the first line of every entry; defaults to the global
#           `startlines`
#   ends    index of the last line of every entry; defaults to the global
#           `endlines`
# Value:
#   a length-one character vector holding lines starts[n] to ends[n] of
#   `data`, joined with "\n"
#
# `starts` and `ends` are optional, so existing calls such as
# get_bibitem(n = i) keep reading the global indexes exactly as before.
get_bibitem <- function(data = bibdata,
                        n,
                        starts = startlines,
                        ends = endlines) {
  # Fail with a readable message instead of the "NA/NaN argument" error that
  # an out-of-range index would otherwise trigger inside `from:to`.
  stopifnot(
    "`n` must be a single, non-missing entry index" =
      length(n) == 1L && !is.na(n),
    "`n` is outside the range of indexed entries" =
      n >= 1 && n <= length(starts) && n <= length(ends)
  )
  paste(data[starts[n]:ends[n]], collapse = "\n")
}
# Collapse every entry into one string, in the same order as `citekeys`.
# seq_along() yields an empty sequence when there are no entries (1:length()
# would count 1, 0), and vapply() fills a character vector of known size
# instead of growing one with c() on every iteration.
bib_vec <- vapply(
  seq_along(citekeys),
  function(i) get_bibitem(n = i),
  character(1)
)
# One row per entry: the cite key and the raw text of the entry.
bib_df <- data.frame(
  key = unname(citekeys),
  item = bib_vec
)
# 5. Extract entrytype and tidy entries
# Keep the table built in step 4 under its own name; this step reads from it.
bib_df_old <- bib_df
bib_df <- bib_df_old |>
  mutate(
    # The entry type is the run of letters right after the leading "@". It is
    # matched in either case and stored in lower case, so "@Article" and
    # "@article" both give "article". Text that does not start with "@type"
    # gives NA.
    entrytype = item |>
      str_extract("^@[A-Za-z]+") |>
      str_remove("^@") |>
      str_to_lower(),
    # Drop the header "@type{key," together with its line break. Any character
    # other than a comma or a line break is accepted in the key, so keys that
    # contain "_", ".", "+" or "/" are stripped as well. Then drop the brace
    # that closes the entry, along with any whitespace that trails it.
    item = item |>
      str_remove("^@[A-Za-z]+\\{[^,\n]*,\n") |>
      str_remove("\\}\\s*$")
  ) |>
  select(key, entrytype, item)
# 6. Parse and unnest items, then pivot wider to columns
# Keep the table built in step 5 under its own name; this step reads from it.
bib_df_old2 <- bib_df
bib_df <- bib_df_old2 |>
  # Remove tabs, then break each entry into its lines (a list-column).
  mutate(
    field = item |>
      str_remove_all("\t") |>
      strsplit("\n")
  ) |>
  select(key, entrytype, field) |>
  # One row per line of every entry.
  unnest(field) |>
  mutate(
    # This presumes a spaced equal sign " = " between field name and data.
    # Only the FIRST " = " on a line is treated as the separator, so a value
    # that itself contains " = " is kept whole.
    sep_at = str_locate(field, fixed(" = "))[, "start"],
    # Without a separator the whole line becomes the name and the value is NA.
    # Surrounding blanks are trimmed so space-indented files give clean names.
    name = if_else(
      is.na(sep_at),
      field,
      str_sub(field, 1L, sep_at - 1L)
    ) |>
      str_trim(),
    # Everything after the separator, minus the comma that ends the field.
    value = str_sub(field, sep_at + 3L) |>
      str_remove(",$")
  ) |>
  # Sorting by name makes pivot_wider() create the columns alphabetically.
  arrange(name) |>
  select(key, entrytype, name, value) |>
  pivot_wider(
    id_cols = c(key, entrytype),
    names_from = name,
    values_from = value
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The code assumes some peculiarities of the .bib file (for example, that field names and their data are separated by a spaced equals sign, " = "), but the code and its comments should make it flexible enough to adapt to other layouts.