jmclawson · February 15, 2022 17:11 · jmclawson · Feb 15, 2022
diff --git a/import_bib.R b/import_bib.R
 # import_bib.R
 # Converts a BibTeX file into a data frame for working with the data in R.

 library(dplyr)
 library(stringr)
 library(tidyr)

 # 0. Path of the BibTeX file to import
 the_bibfile <- "~/path/to/my.bib"

 # 1. Load the bib file as a character vector of lines.
 #    Optional extra arguments to scan(), if the file needs them:
 #      skip = 4        omits junk from the header (adjust the count)
 #      nlines = 2285   stops reading early to clear junk from the footer
 bibdata <- scan(
   file = the_bibfile,
   what = "character",
   sep = "\n"
 )

 # 2. Index the start and end of each entry
 # Every entry is assumed to begin on a line that starts with "@".
 startlines <- grep("^@", bibdata)
 if (length(startlines) == 0) {
   stop("No BibTeX entries (lines starting with '@') found in `bibdata`.",
        call. = FALSE)
 }
 # An entry ends on the line before the next entry starts; the last entry
 # runs to the final line. `startlines[-1]` is simply empty when there is a
 # single entry, whereas `startlines[2:length(startlines)]` would index
 # c(2, 1) and produce NA / misordered end lines.
 endlines <- c(startlines[-1] - 1, length(bibdata))

 # 3. Get a list of cite keys in order.
 # Strip the leading "@type{" (the type is matched case-insensitively, so
 # "@Article{" works as well as "@article{"), then drop the first comma,
 # which is the one that follows the key.
 citekeys <- bibdata[startlines] |>
   str_remove("^@[A-Za-z]*\\{") |>
   str_remove(",")

 # 4. Convert the whole vector into a table, with one row per entry

 # Collapse the lines of the n-th entry into one newline-separated string.
 #   data    character vector of lines (defaults to the global `bibdata`)
 #   n       position of the entry, used to index `starts` and `ends`
 #   starts  first line index of each entry (defaults to global `startlines`)
 #   ends    last line index of each entry (defaults to global `endlines`)
 # Returns a length-one character vector.
 # `starts` and `ends` used to be read implicitly from the global environment;
 # they are now defaulted parameters, so existing calls such as
 # get_bibitem(n = i) behave exactly as before.
 get_bibitem <- function(data = bibdata,
                         n,
                         starts = startlines,
                         ends = endlines) {
   paste(data[starts[n]:ends[n]], collapse = "\n")
 }

 # Collapse every entry to a single string, in the same order as `citekeys`.
 # seq_along() gives an empty sequence when there are no keys (1:length()
 # would give c(1, 0)), and vapply() builds the result in one pass instead of
 # regrowing the vector with c() on every iteration.
 bib_vec <- vapply(seq_along(citekeys),
                   \(i) get_bibitem(n = i),
                   character(1))

 # One row per entry: the cite key and the raw text of the entry.
 bib_df <- data.frame(key = unname(citekeys),
                      item = bib_vec)

 # 5. Extract entrytype and tidy entries
 bib_df_old <- bib_df
 bib_df <- bib_df_old |>
   mutate(
     # Entry type: the letters between the leading "@" and "{", lower-cased
     # so that "@Article" and "@article" give the same value. The pattern is
     # anchored so an "@" inside a field value cannot match.
     entrytype = tolower(sub("^@([A-Za-z]+).*", "\\1", item)),
     # Drop the "@type{key," header line. The key is matched as "anything up
     # to the first comma", so keys containing "_", "." or "/" are handled.
     item = sub("^@[A-Za-z]+\\{[^,]*,[ \t]*\n(.*)", "\\1", item),
     # Drop the entry's closing brace and any whitespace that follows it.
     item = sub("\\}[[:space:]]*$", "", item)
   ) |>
   select(key, entrytype, item)

 # 6. Parse and unnest items, then pivot wider to columns
 bib_df_old2 <- bib_df
 bib_df <- bib_df_old2 |>
   # One list element per entry holding its field lines (tabs removed).
   mutate(field = strsplit(gsub("\t", "", item), "\n")) |>
   select(key, entrytype, field) |>
   unnest(field) |>
   mutate(
     # Presumes a spaced equal sign (" = ") separates field name and data.
     # Split on the FIRST " = " only, so a value that itself contains " = "
     # is kept whole instead of being cut at its second piece.
     parts = str_split(field, fixed(" = "), n = 2),
     # Trim the name so space-indented files give "author", not "  author".
     name = vapply(parts, \(p) trimws(p[1]), character(1)),
     # p[2] is NA for a line without " = "; such lines (e.g. continuation
     # lines of a multi-line value) are not reassembled here.
     value = vapply(parts, \(p) p[2], character(1)),
     # Remove the comma that terminates the field.
     value = sub(",$", "", value)
   ) |>
   # Sorting by name makes the pivoted columns come out alphabetically.
   arrange(name) |>
   select(-field, -parts) |>
   pivot_wider(id_cols = c(key, entrytype),
               names_from = name,
               values_from = value)
	# import_bib.R
	# Converts a BibTeX file into a data frame for working with the data in R.

	library(dplyr)
	library(stringr)
	library(tidyr)

	# 0. Path of the BibTeX file to import
	the_bibfile <- "~/path/to/my.bib"

	# 1. Load the bib file as a character vector of lines.
	#    Optional extra arguments to scan(), if the file needs them:
	#      skip = 4        omits junk from the header (adjust the count)
	#      nlines = 2285   stops reading early to clear junk from the footer
	bibdata <- scan(
	  file = the_bibfile,
	  what = "character",
	  sep = "\n"
	)

	# 2. Index the start and end of each entry
	# Every entry is assumed to begin on a line that starts with "@".
	startlines <- grep("^@", bibdata)
	if (length(startlines) == 0) {
	  stop("No BibTeX entries (lines starting with '@') found in `bibdata`.",
	       call. = FALSE)
	}
	# An entry ends on the line before the next entry starts; the last entry
	# runs to the final line. `startlines[-1]` is simply empty when there is a
	# single entry, whereas `startlines[2:length(startlines)]` would index
	# c(2, 1) and produce NA / misordered end lines.
	endlines <- c(startlines[-1] - 1, length(bibdata))

	# 3. Get a list of cite keys in order.
	# Strip the leading "@type{" (the type is matched case-insensitively, so
	# "@Article{" works as well as "@article{"), then drop the first comma,
	# which is the one that follows the key.
	citekeys <- bibdata[startlines] |>
	  str_remove("^@[A-Za-z]*\\{") |>
	  str_remove(",")

	# 4. Convert the whole vector into a table, with one row per entry

	# Collapse the lines of the n-th entry into one newline-separated string.
	#   data    character vector of lines (defaults to the global `bibdata`)
	#   n       position of the entry, used to index `starts` and `ends`
	#   starts  first line index of each entry (defaults to global `startlines`)
	#   ends    last line index of each entry (defaults to global `endlines`)
	# Returns a length-one character vector.
	# `starts` and `ends` used to be read implicitly from the global
	# environment; they are now defaulted parameters, so existing calls such as
	# get_bibitem(n = i) behave exactly as before.
	get_bibitem <- function(data = bibdata,
	                        n,
	                        starts = startlines,
	                        ends = endlines) {
	  paste(data[starts[n]:ends[n]], collapse = "\n")
	}

	# Collapse every entry to a single string, in the same order as `citekeys`.
	# seq_along() gives an empty sequence when there are no keys (1:length()
	# would give c(1, 0)), and vapply() builds the result in one pass instead
	# of regrowing the vector with c() on every iteration.
	bib_vec <- vapply(seq_along(citekeys),
	                  \(i) get_bibitem(n = i),
	                  character(1))

	# One row per entry: the cite key and the raw text of the entry.
	bib_df <- data.frame(key = unname(citekeys),
	                     item = bib_vec)

	# 5. Extract entrytype and tidy entries
	bib_df_old <- bib_df
	bib_df <- bib_df_old |>
	  mutate(
	    # Entry type: the letters between the leading "@" and "{", lower-cased
	    # so that "@Article" and "@article" give the same value. The pattern is
	    # anchored so an "@" inside a field value cannot match.
	    entrytype = tolower(sub("^@([A-Za-z]+).*", "\\1", item)),
	    # Drop the "@type{key," header line. The key is matched as "anything
	    # up to the first comma", so keys containing "_", "." or "/" work.
	    item = sub("^@[A-Za-z]+\\{[^,]*,[ \t]*\n(.*)", "\\1", item),
	    # Drop the entry's closing brace and any whitespace that follows it.
	    item = sub("\\}[[:space:]]*$", "", item)
	  ) |>
	  select(key, entrytype, item)

	# 6. Parse and unnest items, then pivot wider to columns
	bib_df_old2 <- bib_df
	bib_df <- bib_df_old2 |>
	  # One list element per entry holding its field lines (tabs removed).
	  mutate(field = strsplit(gsub("\t", "", item), "\n")) |>
	  select(key, entrytype, field) |>
	  unnest(field) |>
	  mutate(
	    # Presumes a spaced equal sign (" = ") separates field name and data.
	    # Split on the FIRST " = " only, so a value that itself contains " = "
	    # is kept whole instead of being cut at its second piece.
	    parts = str_split(field, fixed(" = "), n = 2),
	    # Trim the name so space-indented files give "author", not "  author".
	    name = vapply(parts, \(p) trimws(p[1]), character(1)),
	    # p[2] is NA for a line without " = "; such lines (e.g. continuation
	    # lines of a multi-line value) are not reassembled here.
	    value = vapply(parts, \(p) p[2], character(1)),
	    # Remove the comma that terminates the field.
	    value = sub(",$", "", value)
	  ) |>
	  # Sorting by name makes the pivoted columns come out alphabetically.
	  arrange(name) |>
	  select(-field, -parts) |>
	  pivot_wider(id_cols = c(key, entrytype),
	              names_from = name,
	              values_from = value)