Created
September 8, 2012 09:13
-
-
Save acardona/3672948 to your computer and use it in GitHub Desktop.
A parser for a subset of BibTeX files, written with clojure monads
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Albert Cardona, 2012-09-08 | |
; http://albert.rierol.net/clojure-monads.html | |
(ns my.parse.bib5 | |
(:use [clojure.algo.monads :only [domonad with-monad state-t maybe-m fetch-state set-state m-seq m-plus m-result]]) | |
(:use [clojure.string :only [lower-case]]) | |
(:use [clojure.pprint :only [pprint]])) | |
(set! *warn-on-reflection* true) | |
(def parser-m (state-t maybe-m)) | |
(with-monad parser-m | |
; Primatives: the basic building blocks | |
(defn get-one | |
"Gets the next item from the input and returns it, | |
while also updating the state to be the sequence starting | |
at the next element." | |
[] | |
(domonad [input (fetch-state) | |
_ (set-state (next input))] | |
(first input))) | |
(def | |
^{:doc "Returns nil to signal that the end of the sequence has been reached, | |
otherwise fails in monadic way to indicate that the end can't be found here, | |
and therefore the parsing has to backtrack to try something else or fail altogether."} | |
eof | |
(domonad [remaining (fetch-state) | |
:when (nil? remaining)] | |
nil))) | |
(with-monad parser-m | |
; Basic, generic parsers, built on top of the primatives | |
(defn matching | |
"The most basic matching parser. Tests the next item | |
in the sequence against the predicate provided. If true, | |
returns the item, otherwise fails." | |
[pred] | |
(domonad [one (get-one) | |
:when (pred one)] | |
one)) | |
(defn one | |
"Next element matches x exactly. What this function does | |
is to invoke the matching function with a new anonymous function | |
that uses the equality operator to compare x with the next element." | |
[x] | |
(matching #(= x %))) | |
(defn not-one | |
"Next element will not match x. Enforces that the element to match | |
cannot be nil, which would get confused with the end of sequence." | |
[x] | |
(matching (fn [y] | |
(and (not= x y) | |
(not= nil y))))) | |
(defn one-of | |
"Matches any one item in the provided set. This function invokes | |
the matching function with the provided set as argument; the set | |
works as a predicate, because sets in clojure are functions of | |
their elements, that is, sets are also functions, which return | |
the stored element when given an equal element as argument." | |
[s] | |
(matching s)) | |
(defn not-one-of | |
"Matches any one item not in the provided set, and returns it. | |
To make this work the set cannot be used directly as predicate | |
of the matching function, but instead an anonymous function is | |
constructed that returns true when the element is not part of the set. | |
Enforces that the element to match cannot be nil, which would | |
get confused with the end of the sequence." | |
[s] | |
(matching (fn [y] | |
(and (nil? (s y)) | |
(not= nil y)))))) | |
(with-monad parser-m | |
; Combinators | |
(defn optional | |
"Return a parser that makes the given parser optional. | |
This is accomplished by using m-plus to combine two monadic | |
functions: the one provided (the parser) and also (m-result nil) | |
which signals a void, but valid monadic return value. If the | |
parser doesn't match, then (m-result nil) is returned, signaling | |
that the state machine did not advance to the next step." | |
[parser] | |
(m-plus parser (m-result nil))) | |
(defn one-or-more | |
"Matches the same parser one or more times until it fails, | |
then it returns a sequence of the matched results. Given | |
its recursive call, this function can overflow the stack | |
when positively matching sequences longer than the possible | |
stack depth. | |
First the parser is used to match the first item in the sequence, | |
and if successful, an optional recursive call is done to match | |
further consecutive items. Finally a flattened sequence is returned | |
with all matched items in order. | |
Given that the parser-m is a modification of the maybe-m, the | |
second operation will not be attempted unless the first operation | |
succeeded." | |
[parser] | |
(domonad [r parser | |
rs (optional (one-or-more parser))] | |
(if rs | |
(into [r] (flatten rs)) | |
[r]))) | |
(defn none-or-more | |
"Matches the same parser zero or more times until it fails, | |
then it returns a sequence of the matched results." | |
[parser] | |
(optional (one-or-more parser))) | |
(defn skip-one-or-more | |
"Matches the same parser one of more times until it fails, | |
then it returns true. Or nil if it doesn't match at least once. | |
Given its recursivity this function can overflow the stack. | |
This function works like one-or-more, except that it doesn't | |
bind neither return the matched values." | |
[parser] | |
(domonad [_ parser | |
_ (optional (skip-one-or-more parser))] | |
true)) | |
(defn skip-none-or-more | |
"Matches the same parser zero or more times until it fails, | |
then returns true." | |
[parser] | |
(optional (skip-one-or-more parser)))) | |
(with-monad parser-m | |
; Parser combinators | |
(defn match-one | |
"Match at least one of the given parsers, as evaluated in order, | |
or else fail. What this function does is to return a nested | |
set of functions of the state using m-plus. When executed, | |
when one matches the chain stops and the current matched item | |
or sequence of items is returned, according to the parser." | |
[& parsers] | |
(reduce m-plus parsers)) | |
(defn match-all | |
"Match all given parsers, else fail. Returns a flattened sequence | |
with all results. This is accomplished by generating a sequence of | |
nested functions which, when invoked with the state as argument, | |
thread the state altering it as each is invoked, while the results | |
are accumulated in a sequence." | |
[& parsers] | |
(m-bind (m-seq parsers) | |
(comp m-result flatten)))) | |
(with-monad parser-m | |
(let [la "abcdefghijklmnopqrstuvwxyz" | |
ua (set (.toUpperCase la)) | |
la (set la) | |
letters (into la ua) | |
ext-la "áéíóúàèìòùäëïöüâêîôûßçñµ" ; Add more at will | |
ext-ua (into ua (.toUpperCase ext-la)) | |
ext-la (into la ext-la) | |
ext-letters (into letters (into ext-la ext-ua)) | |
sp (set " \t\n\r") | |
numbers (set "1234567890") | |
symbols (set "'{}\\:;,.()[]$_-#@%^&*+=!?<>/|~`") ; lacks a " in purpose | |
non-syntax-symbols (set "':;,.()[]!@#$%^&*()-_+=<>?/|~`")] | |
(def | |
^{:doc "Match any whitespace character."} | |
whitespace (one-of sp)) | |
(def | |
^{:doc "Match any letter from the alphabet, in both lower and upper case."} | |
letter (one-of letters)) | |
(def | |
^{:doc "Match any upper-case letter."} | |
upper-case-letter (one-of ua)) | |
(def | |
^{:doc "Match any lower-case letter."} | |
lower-case-letter (one-of la)) | |
(def | |
^{:doc "Match any letter of the alphabet, in both lower and upper case, | |
and also letters with tilde, etc."} | |
ext-letter (one-of ext-letters)) | |
(def | |
^{:doc "Match any upper-case letter of the alphabet including letters | |
with tilde, umlauts, etc."} | |
ext-upper-case-letter (one-of ext-ua)) | |
(def | |
^{:doc "Match any numeric character."} | |
number (one-of numbers)) | |
(def | |
^{:doc "Match an allowed non-alphabetic character"} | |
non-letter (one-of symbols)) | |
(def | |
^{:doc "Match a symbol that is not part of the supported latex syntax."} | |
non-syntax-symbol (one-of non-syntax-symbols)) | |
(def | |
^{:doc "Match any amount of text with extended alphabetical or whitespace characters."} | |
plain-text | |
(one-or-more (match-one ext-letter | |
whitespace))) | |
(def | |
^{:doc "Match any character allowed inside the text of a property of a BibTeX entry."} | |
latex-char | |
(match-one ext-letter | |
whitespace | |
number | |
(match-all (one \\) (one \") letter) ; This is e.g. ä in latex: \"a | |
non-letter)))) | |
(with-monad parser-m | |
(def | |
^{:doc "One property of a BibTex entry, returned as e.g. {:author \"Albert Cardona\"}"} | |
property | |
(domonad [_ (skip-none-or-more whitespace) | |
prop-name (one-or-more letter) | |
_ (match-all | |
(skip-none-or-more whitespace) | |
(one \=) | |
(skip-none-or-more whitespace) | |
(one \")) | |
prop-value (none-or-more latex-char) | |
_ (match-all | |
(one \") | |
(skip-none-or-more whitespace) | |
(one \,))] | |
{(keyword (lower-case (apply str prop-name))) (apply str prop-value)})) | |
(def | |
^{:doc "One entry in the BibTeX file, returned as a one-entry map with | |
the alias as key and the properties as a map, e.g. {Cardona2012 {:author | |
\"Albert Cardona\" :year \"2012\" :journal \"Nature Methods\"}}"} | |
entry | |
(domonad [_ (skip-none-or-more whitespace) | |
_ (one \@) | |
kind (one-or-more letter) ; article, book, inproceedings, etc. | |
_ (one \{) | |
_ (none-or-more whitespace) | |
alias (one-or-more (not-one \,)) | |
_ (one \,) | |
ps (none-or-more property) | |
_ (none-or-more whitespace) | |
_ (one \})] | |
{(apply str alias) (merge {:kind (lower-case (apply str kind))} | |
(apply merge ps))})) | |
(def | |
^{:doc "A map of all entries in the BibTeX file, each with the alias | |
as key and the map of properties as value."} | |
entries | |
(domonad [es (none-or-more entry)] | |
(apply merge es)))) | |
; Test 1: correctness | |
(comment | |
(let [state (slurp (str (System/getProperty "user.home") | |
\/ | |
"webs/trakem2/trakem2_citations.bib")) | |
[es state] (entries (lazy-seq state))] | |
(doseq [[k v] (take 2 es)] | |
(println k) ; the alias, e.g. Sprecher2011 | |
(pprint v) ; the map of properties of this entry | |
(.flush *out*))) ; the pretty-printer doesn't flush the *out* writer for some reason | |
) | |
;Test 2: speed comparison with regular expressions | |
(comment | |
(let [input " author = \"Cardona A and Saalfeld S\",\n" | |
regex #"( |\t|\n|\r)*([a-zA-Z]+)( |\t|\n|\r)*=( |\t|\n|\r)*(.*)( |\t|\n|\r)*"] | |
(let [s (re-find regex input)] | |
(doseq [[i v] (zipmap (range (count s)) s)] (println i "=>" v))) | |
(time | |
(dotimes [i 1000] | |
(property (lazy-seq input)))) | |
(time | |
(dotimes [i 1000] | |
(re-find regex input)))) | |
) | |
; Set of functions to transform latex text into html text | |
(with-monad parser-m | |
(defmacro match-string | |
[text] | |
`(match-all ~@(map one text))) | |
(defn fn-latex-to-html | |
"Cope with 'declare' not working for calls to the referred name inside domonad, | |
so can't declare ahead 'latex-to-html'. Here, we find it at runtime." | |
[state] | |
(((ns-interns 'my.parse.bib5) 'latex-to-html) state)) | |
(def | |
^{:doc "If \\emph{} is matched, returns a sequence of <i>text</i>"} | |
emph | |
(domonad [;_ (match-string "\\emph{") ; Cannot use macros here: throws ExceptionInInitializationError | |
;_ `(match-all ~@(map one "\\emph{")) ; Fails: Cons cannot be cast to IFn, logically | |
_ (match-all (one \\) (one \e) (one \m) (one \p) (one \h) (one \{)) | |
words fn-latex-to-html ; Cannot use latex-to-html inside domonad when declared prior to defined. | |
_ (one \})] | |
(flatten (map seq ["<i>" words "</i>"])))) | |
(def | |
^{:doc "If \\textbf{} is matched, returns a sequence of <b>text</b>"} | |
textbf | |
(domonad [_ (match-all (one \\) (one \t) (one \e) (one \x) (one \t) (one \b) (one \f) (one \{)) | |
words fn-latex-to-html | |
_ (one \})] | |
(flatten (map seq ["<b>" words "</b>"])))) | |
(def | |
^{:doc "If \\\"a, or another vowel, is matched, returns ä etc." } | |
umlaut | |
(domonad [_ (one \\) | |
_ (one \") | |
vowel (one-of (set "aeiou"))] | |
[\& vowel \u \m \l \;])) | |
(def | |
^{:doc "If \\'a, or another letter, is matched, returns á etc."} | |
acute | |
(domonad [_ (one \\) | |
_ (one \') | |
l letter] | |
[\& l \a \c \u \t \e \;])) | |
(def | |
^{:doc "If \\`a, or another vowel, is matched, returns à etc."} | |
grave | |
(domonad [_ (one \\) | |
_ (one \`) | |
vowel (one-of (set "aeiou"))] | |
[\& vowel \g \r \a \v \e \;])) | |
(def | |
^{:doc "If \\v{a}, or another letter, is matched, returns the letter alone. | |
Simplification serves the purpose."} | |
v-hat | |
(domonad [_ (one \\) | |
_ (one \v) | |
_ (one \{) | |
l letter | |
_ (one \})] | |
l)) | |
(def | |
^{:doc "Matches a TeX block like {one two} and simply removes the {}"} | |
block | |
(domonad [_ (one \{) | |
words fn-latex-to-html ; can't use latex-to-html: doesn't work recursively, or can't see the ahead declaration or this one doesn't get resolved. | |
_ (one \})] | |
(flatten words))) | |
(def | |
^{:doc "Returns a sequence of matched text which may include | |
emph, textbf, etc. translated into their html equivalents. | |
This function is recursive via the calls to it from other | |
functions called within."} | |
latex-to-html | |
(none-or-more | |
; Order matters: | |
(match-one block | |
ext-letter | |
whitespace | |
number | |
acute | |
grave | |
umlaut | |
v-hat | |
emph | |
textbf | |
non-syntax-symbol))) | |
(def | |
^{:doc "A valid word in the author field."} | |
author-word | |
(domonad [word (match-one | |
(match-all ext-upper-case-letter | |
(none-or-more (not-one-of (set " \t\n\r")))) | |
block) | |
_ (optional (match-one (one \.) (one \,)))] | |
word)) | |
(def | |
^{:doc "Match for example 'Cardona, A.' or 'Albert Cardona' or 'Albert T. Cardona' | |
or just about anything that starts with upper case and ends with optionally | |
a whitespace or an 'and'."} | |
one-author | |
(domonad [_ (skip-none-or-more whitespace) | |
first-word author-word | |
rest-words (none-or-more (match-all (one-or-more whitespace) | |
author-word)) | |
_ (optional (match-all (one-or-more whitespace) (one \a) (one \n) (one \d) whitespace))] | |
(apply str (flatten (if rest-words | |
[first-word rest-words] | |
first-word))))) | |
(defn format-authors | |
"Formats authors that all 'and' are replaced by a comma except the last. | |
Returns a String." | |
[^String author-field] | |
; domonad creates a function, invoked here with the author-field as argument. | |
; The function returns the vector pair of value and remaining state, | |
; from which the first, the value, is returned. | |
(first | |
((domonad [authors (one-or-more one-author)] | |
; authors contains a sequence of String, one per author. String doesn't get flattened. | |
(if (= 1 (count authors)) | |
(first authors) | |
(str (clojure.string/join ", " (drop-last authors)) | |
" and " | |
(last authors)))) | |
author-field))) | |
(defn html | |
"Translate latex text into its HTML equivalent. Considers only | |
a small fraction of the possibilities." | |
[^String latex] | |
(let [f (domonad [t latex-to-html] | |
(apply str (flatten t))) | |
[value state] (f (seq latex))] | |
value))) | |
; Multimethods to interpret every entry and output as structured text | |
; Will dispatch on the kind of entry: article, book, inproceedings and incollection | |
(defmulti apalike | |
(fn [x] (x :kind))) | |
(defmethod apalike "article" | |
[entry] | |
(str | |
(if-let [v (entry :author)] (str (html (format-authors v)) \.) "") | |
(if-let [v (entry :year)] (str \space v \.) "") | |
(if-let [v (entry :title)] (str \space (html v) \.) "") | |
(if-let [v (entry :journal)] (str \space v \space) "") | |
(let [vol (entry :volume) | |
num (entry :number) | |
num (if (nil? num) (entry :issue) num)] | |
(cond | |
(and vol num) (str vol \( num \)) | |
(and vol (nil? num)) vol | |
(and (nil? vol) num) num | |
:else "")) | |
(if-let [v (entry :pages)] (str \: v)) | |
\.)) | |
(defn apalike-in | |
"Serves both incollection and inproceedings" | |
[entry] | |
(str | |
(if-let [v (entry :author)] (str (html v) \.) "") | |
(if-let [v (entry :year)] (str \space v \.) "") | |
(if-let [v (entry :title)] (str \space (html v) \.) "") | |
(if-let [v (entry :booktitle)] (str " In: " (html v) \.) "") | |
(if-let [v (entry :series)] (str \space (html v) \.) "") | |
(if-let [v (entry :editor)] (str " Edited by: " (html v) \.) "") | |
(if-let [v (entry :publisher)] (str \space (html v) \.) "") | |
(let [vol (entry :volume) | |
pages (entry :pages)] | |
(cond | |
(and vol pages) (str "Vol. " vol ", pages " pages \.) | |
(and (nil? vol) pages) (str "Pages " pages \.) | |
(and vol (nil? pages)) (str "Vol. " vol \.) | |
:else "")))) | |
(defmethod apalike "inproceedings" | |
[entry] | |
(apalike-in entry)) | |
(defmethod apalike "incollection" | |
[entry] | |
(apalike-in entry)) | |
(defmethod apalike "book" | |
[entry] | |
(str | |
(if-let [v (entry :author)] (str (html v) \.) "") | |
(if-let [v (entry :year)] (str \space v \.) "") | |
(if-let [v (entry :title)] (str \space (html v) \.) "") | |
(if-let [v (entry :booktitle)] (str " In: " (html v) \.) "") | |
(if-let [v (entry :editor)] (str " Edited by: " (html v) \.) "") | |
(let [chapter (entry :chapter) ; chapter number | |
pages (entry :pages)] | |
(cond | |
(and chapter pages) (str " Chapter " chapter ", pages " pages \.) | |
(and (nil? chapter) pages) (str " Pages " pages \.) | |
(and chapter (nil? pages)) (str " Chapter " chapter \.) | |
:else "")))) | |
(defn sort-by-year | |
"Return a sorted collection made from the map of entries, | |
where entries are sorted first by year and second by alias." | |
[es] | |
(vals | |
(reduce | |
(fn [s [k v]] | |
(assoc s (str (v :year) k))) | |
{} | |
es))) | |
; Print out parsed and formatted BibTeX entries | |
(comment | |
(let [state (slurp "/home/albert/webs/trakem2/trakem2_citations.bib") | |
[es state] (entries state)] | |
(println "<ul>") | |
(doseq [[k v] es] | |
(println (str " <li>" (apalike v) "</li>")))) | |
(println "</ul>") | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment