-
-
Save vdikan/6d20ea4a4749367679df2593ef95cb32 to your computer and use it in GitHub Desktop.
Bibtex parser using parser combinators in Common Lisp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Package definition. Uninterned symbol designators (#:name) are used so that
;; reading these forms does not intern stray symbols (BIBTEX, PARSE) into
;; whatever package happens to be current when the file is loaded.
(defpackage #:bibtex
  (:use #:cl #:parser-combinators #:alexandria)
  (:export #:parse)
  (:documentation
   "Parse bibtex files.

The current state is quite fragile and has not been extensively tested.

There is one external function: PARSE."))

(in-package #:bibtex)
;; Parse a whole bib file: optional leading whitespace followed by zero or
;; more entries separated by (possibly empty) whitespace.
;; The leading (white?) matters: without it, input that starts with a blank
;; line or indentation cannot match BIB-ENTRY? at position 0, so SEPBY? would
;; match zero entries. Input that starts directly with "@" parses exactly as
;; before, because WHITE? is described in this file as "zero or more
;; whitespace".
(def-cached-parser bib-file?
  (named-seq?
    (white?)
    (<- entries (sepby? (bib-entry?)
                        (white?)))
    entries))
;; Parse a single entry of the shape
;;   @type{name, field = value, field = value, ...}
;; The result is one flat list: the type, then the name, then every parsed
;; field item (each as returned by BIB-ENTRY-ITEM?) spliced in after them.
(def-cached-parser bib-entry?
  (named-seq?
    ;; "@" followed by the entry type
    (<- entry-type (bib-entry-type?))
    ;; opening brace, with optional whitespace on both sides
    (white?) "{" (white?)
    ;; the entry's name, which must be followed by a comma
    (<- entry-name (bib-name?))
    (white?) "," (white?)
    ;; zero or more "field = value" items
    (<- fields (many? (bib-entry-item?)))
    ;; closing brace and any trailing whitespace
    "}" (white?)
    (list* entry-type entry-name fields)))
;; Zero or more whitespace characters (space, newline or tab).
;; Both bounds passed to BETWEEN? are NIL, i.e. no minimum and no maximum.
;; NOTE(review): the 'NULL result type presumably means the matched
;; characters are discarded -- confirm against the parser-combinators docs.
(def-cached-parser white?
  (between? (sat (lambda (ch)
                   (member ch '(#\Space #\Newline #\Tab))))
            nil
            nil
            'null))
;; Parse the name of a bib-entry: everything up to (but not including) the
;; first space, newline, tab, comma or closing brace.
;; NOTE(review): no result type is passed to GATHER-IF-NOT*, so the name has
;; whatever representation the library produces by default -- confirm whether
;; that is a string or a list of characters.
(def-cached-parser bib-name?
  (gather-if-not* (lambda (ch)
                    (member ch '(#\Space #\Newline #\Tab #\, #\})))))
;; Parse the type of a bib-entry: a literal "@" immediately followed by a
;; word. Only the word is returned; the "@" is dropped.
(def-cached-parser bib-entry-type?
  (named-seq?
    "@"
    (<- kind (word?))
    kind))
;; Parse one item inside a bib-entry: a field name, "=", and then a value that
;; is either a bare word or a brace-delimited (more or less complicated) value.
;; An optional trailing comma and surrounding whitespace are consumed.
;; Result: a two-element list (KEY VALUE), where KEY is the upcased field name
;; interned in the KEYWORD package.
(def-cached-parser bib-entry-item?
  (named-seq?
    (<- field (word?))
    (white?) "=" (white?)
    (<- value (choice (word?)
                      (bib-entry-value?)))
    (white?) (opt? ",") (white?)
    (let ((key (intern (string-upcase field) :keyword)))
      (list key value))))
;; Parse a brace-delimited field value: "{", the (possibly nested) contents,
;; then "}". Only the contents parsed by BIB-ENTRY-VALUE-REC? are returned;
;; the outer braces are dropped.
(def-cached-parser bib-entry-value?
  (named-seq?
    "{"
    (<- contents (bib-entry-value-rec?))
    "}"
    contents))
;; Coerce X to a string: a string is returned unchanged (the very same
;; object); anything else is used as the single element of a fresh
;; one-character string, so in practice X should be a character.
(defun assert-string (x)
  (typecase x
    (string x)
    (t (make-string 1 :initial-element x))))
;; The most complicated parser.
;; Problem: the value of a bib-item is delimited by { and }, but it can also
;; contain those characters itself, so a recursive parser is needed here.
;; Result: one string holding the contents, with any nested braces kept.
(def-cached-parser bib-entry-value-rec?
  (named-seq?
    ;; Parse the contents into a tree of strings and characters
    (<- tree
        (many?
          ;; Either plain text, or a nested group wrapped in "{" and "}"
          (choice
            ;; Plain text: a run of characters that are not braces
            (gather-if-not* (rcurry 'member '(#\{ #\})))
            ;; Nested group: recurse, keeping the braces in the tree
            (seq-list?
              "{"
              (bib-entry-value-rec?)
              "}"))))
    ;; Join the flattened pieces through a string stream rather than
    ;; (apply 'concatenate 'string ...): the flattened tree can be very long
    ;; (potentially one element per character, depending on what
    ;; GATHER-IF-NOT* returns), and APPLY on such a list can exceed
    ;; CALL-ARGUMENTS-LIMIT on some implementations. An empty tree still
    ;; yields the empty string.
    (with-output-to-string (out)
      (dolist (piece (flatten tree))
        (write-string (assert-string piece) out)))))
(defun parse (string)
  "Parse the BibTeX content of STRING with the BIB-FILE? grammar.

The values of PARSE-STRING* are returned unchanged; the parse result is
intended to be a list of entries.  Each entry is a flat list:

  - the first element is the entry type (the word after \"@\");
  - the second element is the entry name, as produced by BIB-NAME?;
  - every remaining element is a two-element list (KEY VALUE), where KEY is
    a symbol in the KEYWORD package made from the upcased field name and
    VALUE is the field's value as a string."
  (let ((grammar (bib-file?)))
    (parse-string* grammar string)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment