Last active
October 14, 2021 20:44
-
-
Save erantapaa/5a2614adde0526d25c03 to your computer and use it in GitHub Desktop.
BibTeX file parsing Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Simple BibTeX file parsing in Python.
#
# See `bibtest1` for an example of usage.
#
# This is a good overview of how to correctly parse a BibTeX file:
#
# http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
import string

# Characters accepted inside an identifier (entry type, citation key,
# field name, bare value): ASCII letters and digits only.
wordLetters = string.ascii_lowercase + string.ascii_uppercase + string.digits
class ParseError(Exception):
    """Raised when the input does not match the expected BibTeX syntax.

    The human-readable description is kept in ``msg``; ``str()`` prefixes
    it with ``"parse error: "``.
    """

    def __init__(self, msg):
        # Forward the message to the base class so that ``args`` is
        # populated regardless of the interpreter version.
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return "parse error: " + self.msg
class Tokenizer:
    """Character-level scanner over a BibTeX source held in memory.

    Attributes:
        buf: the complete text being scanned.
        i:   index of the next unread character in ``buf``.
        len: cached ``len(buf)``.

    The ``match_*`` methods consume input and then skip trailing white
    space; the ``scan*`` methods consume exactly the delimited string
    they return and nothing after it.
    """

    def __init__(self, buf):
        self.buf = buf
        self.i = 0
        self.len = len(buf)

    def peek(self):
        """Return the character at the cursor without consuming it.

        Raises ParseError at end of input, so a truncated file is
        reported as a parse error rather than a bare IndexError.
        """
        if self.i >= self.len:
            raise ParseError("unexpected end of input")
        return self.buf[self.i]

    def skipwhite(self):
        """Skip white space"""
        pos = self.i
        while pos < self.len and self.buf[pos] in string.whitespace:
            pos += 1
        self.i = pos

    def try_match_string(self, s):
        """Return True if the text at the cursor starts with `s`.

        Nothing is consumed.
        """
        return self.buf.startswith(s, self.i)

    def try_match_letter(self):
        """Return True if the cursor is on an identifier character.

        Nothing is consumed; at end of input the answer is False.
        """
        return self.i < self.len and self.buf[self.i] in wordLetters

    def match_string(self, s):
        """Match a literal string and skip following white space

        Returns True on success; raises ParseError if the text at the
        cursor is not `s`.
        """
        if not self.try_match_string(s):
            raise ParseError("expecting " + s)
        self.i += len(s)
        self.skipwhite()
        return True

    def match_word(self):
        """Match an identifier and skip following white space

        Returns the identifier; raises ParseError if the cursor is not
        on an identifier character.
        """
        end = self.i
        while end < self.len and self.buf[end] in wordLetters:
            end += 1
        if end == self.i:
            raise ParseError("expecting identifier")
        word = self.buf[self.i:end]
        self.i = end
        self.skipwhite()
        return word

    def skiptoat(self):
        """Skip to an @ followed by a letter

        On success the cursor is left on the '@' and True is returned.
        If no such position exists the cursor is not moved and None is
        returned.
        """
        # Same bound as the original scan: an '@' is only considered
        # while at least two more characters follow it in the buffer.
        limit = self.len - 2
        at = self.buf.find('@', self.i)
        while 0 <= at < limit:
            if self.buf[at + 1] in wordLetters:
                self.i = at
                return True
            at = self.buf.find('@', at + 1)
        return None

    def skipToEOL(self):
        """Advance the cursor to the next newline, or to end of input.

        The newline itself is not consumed.
        """
        eol = self.buf.find('\n', self.i)
        self.i = self.len if eol < 0 else eol

    def scanString(self):
        """Scan a "..." or {...} string starting at the cursor.

        Returns the text between the delimiters and leaves the cursor
        just past the closing delimiter (following white space is NOT
        skipped).  Raises ParseError if the cursor is not on an opening
        delimiter, including at end of input.
        """
        start = self.i
        opener = self.buf[start:start + 1]  # '' at end of input
        if opener == '"':
            text, after = self.scanQuotedString(start + 1)
        elif opener == '{':
            text, after = self.scanBraceString(start + 1)
        else:
            raise ParseError("not at a string")
        self.i = after
        return text

    def scanQuotedString(self, j):
        """Scan the body of a double-quoted string starting at index j.

        Returns a tuple ``(text, index)`` where ``index`` is the
        position of the character after the ending double-quote.
        Brace groups inside the string are copied verbatim, braces
        included, so a '"' inside braces does not end the string.
        The cursor is not moved.  Raises ParseError if the closing
        quote is missing.
        """
        parts = []
        while j < self.len:
            ch = self.buf[j]
            if ch == '"':
                return "".join(parts), j + 1
            if ch == '{':
                # scanBraceString already returns the index after '}'.
                inner, j = self.scanBraceString(j + 1)
                parts.append('{' + inner + '}')
            else:
                parts.append(ch)
                j += 1
        raise ParseError("unterminated double quote string")

    def scanBraceString(self, j):
        """Scan the body of a brace-delimited string starting at index j.

        `j` is the index just after the opening brace.  Returns a tuple
        ``(text, index)`` where ``text`` excludes the outer braces
        (nested braces are kept) and ``index`` is the position of the
        character following the ending brace.  The cursor is not moved.
        Raises ParseError if the braces never balance.
        """
        depth = 1
        k = j
        while k < self.len:
            ch = self.buf[k]
            if ch == '}':
                depth -= 1
                if depth <= 0:
                    return self.buf[j:k], k + 1
            elif ch == '{':
                depth += 1
            k += 1
        raise ParseError("unterminated brace string")
def test1():
    """Check that a single identifier surrounded by blanks is matched."""
    tok = Tokenizer(" X ")
    tok.skipwhite()
    word = tok.match_word()
    assert word == "X"
    return (True, word)
def test2():
    """Check that two consecutive identifiers are matched in order."""
    tok = Tokenizer(" X yzzy ")
    tok.skipwhite()
    first = tok.match_word()
    second = tok.match_word()
    assert first == "X" and second == "yzzy"
    return (True, first, second)
def test3():
    """Check that a quote inside braces does not end a quoted string."""
    tok = Tokenizer('" { " } "xyz')
    text = tok.scanString()
    word = tok.match_word()
    # On failure the assertion message shows what was actually scanned.
    assert (text == ' { " } ' and word == "xyz"), (text, word)
    return (True, text, word)
# BibTeX parsing routines
def parse_entries(t):
    """Parse every entry from the tokenizer's position to end of input.

    Text between entries is ignored.  ``@comment`` (any letter case)
    discards the rest of its line.

    Returns a list of ``(entry_type, ident, pairs)`` tuples, where
    ``pairs`` is the list produced by parse_kv_pairs().

    Raises ParseError if an entry type is not followed by '{' or '(',
    or if any later part of the entry header is malformed.
    """
    entries = []
    while t.skiptoat():
        t.match_string('@')
        w = t.match_word()
        if w.lower() == 'comment':
            t.skipToEOL()
            continue
        ch = t.peek()
        if ch not in ('{', '('):
            # The original built this exception but never raised it, so
            # any character was accepted as the opening delimiter.
            raise ParseError("expecting either { or (")
        t.match_string(ch)  # always succeeds: ch was just peeked
        ident = t.match_word()
        t.match_string(',')
        pairs = parse_kv_pairs(t)
        entries.append((w, ident, pairs))
        # no need to check ending ) or } - skiptoat() will skip over it
    # reached EOF
    return entries
def parse_kv(t):
    """Parse one ``key = value [# value ...]`` field.

    Returns ``(key, vals)`` where ``vals`` is a list of
    ``("string", text)`` tuples for quoted/braced values and
    ``("ident", word)`` tuples for bare words, in source order.  The
    '#' concatenation operators are consumed but not recorded.

    Raises ParseError if the key or the '=' is missing.
    """
    key = t.match_word()
    t.match_string('=')
    vals = []
    while True:
        ch = t.peek()
        if ch == '"' or ch == '{':
            vals.append(("string", t.scanString()))
            # scanString() stops right after the closing delimiter.
            # Skip blanks here so that a following '#' or ',' is seen;
            # otherwise `"a" # "b"` or `{x} ,` would end the field early
            # and the rest of the entry would be dropped.
            resume = t.i
            t.skipwhite()
            if t.i >= t.len:
                # Only blanks remained: put the cursor back so callers
                # that peek() next behave as they did before.
                t.i = resume
                break
        elif ch == '#':
            t.match_string('#')
        elif ch in wordLetters:
            vals.append(("ident", t.match_word()))
        else:
            break
    return (key, vals)
def parse_kv_pairs(t):
    """Parse the comma-separated fields of one entry.

    Fields are read for as long as the cursor is on an identifier
    character; a comma after a field is consumed when present.
    Returns the list of ``(key, vals)`` tuples produced by parse_kv().
    """
    fields = []
    while t.peek() in wordLetters:
        fields.append(parse_kv(t))
        if t.try_match_string(","):
            t.match_string(",")
    return fields
# Sample bibliography exercised by bibtest1(): a braced entry, an
# @comment line, a parenthesised entry using '#' concatenation, and
# free text outside the entries.
bib1 = """
% a sample bibliography file
%
@article{small,
author = {Freely, I.P.},
title = {A small paper},
journal = {The journal of small papers},
year = 1997,
volume = {-1},
note = {to appear},
}
@comment this entire line is a comment @foo {
@article(big,
author = {Jass, Hugh},
title = {A big paper},
journal = {The journal of big papers},
year = 7991 # foo,
volume = {MCMXCVII},
)
% The authors mentioned here are almost, but not quite,
% entirely unrelated to Matt Groening.
"""
import pprint


def bibtest1():
    """Parse the `bib1` sample and pretty-print the resulting entries."""
    entries = parse_entries(Tokenizer(bib1))
    printer = pprint.PrettyPrinter(indent=4)
    printer.pprint(entries)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
where is the output?