erantapaa · October 14, 2021 20:44 · pk1811 · Jan 27, 2017
diff --git a/bibparse.py b/bibparse.py
 #
 # Simple BibTeX file parsing in Python.
 #
 # See `bibtest1` for an example of usage.
 #
 # This is a good overview of how to correctly parse a bibtex file:
 #
 #   http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html

 import string

 # Characters accepted in identifiers (entry types, citation keys, field
 # names and bare values): ASCII letters and digits only.
 wordLetters = string.ascii_lowercase + string.ascii_uppercase + string.digits

 class ParseError(Exception):
   """Raised when the input does not match the expected BibTeX syntax.

   The description is stored in `msg`; str() prefixes it with
   "parse error: ".
   """

   def __init__(self, msg):
     # Initialise the base class as well so that `args` and repr() carry
     # the message instead of being empty.
     Exception.__init__(self, msg)
     self.msg = msg

   def __str__(self):
     return "parse error: " + self.msg

 class Tokenizer:
   """Cursor over a text buffer with the scanning helpers used by the parser.

   Attributes:
     buf: the text being scanned.
     i: index in `buf` of the next unread character.
     len: length of `buf`.
   """

   def __init__(self, buf):
     self.buf = buf
     self.i = 0
     self.len = len(buf)

   def peek(self):
     """Return the character at the cursor without consuming it.

     Indexing is unchecked, so this raises IndexError when the cursor is
     at the end of the buffer.
     """
     return self.buf[self.i]

   def skipwhite(self):
     """Advance the cursor past any run of white space."""
     i = self.i
     while i < self.len and self.buf[i] in string.whitespace:
       i += 1
     self.i = i

   def try_match_string(self, s):
     """Return True if the buffer continues with `s`; the cursor stays put."""
     return self.buf[self.i:self.i+len(s)] == s

   def try_match_letter(self):
     """Return True if the character at the cursor is in `wordLetters`."""
     return self.buf[self.i] in wordLetters

   def match_string(self, s):
     """Match a literal string and skip following white space.

     Returns True on success; raises ParseError if `s` is not next.
     """
     if self.buf[self.i:self.i+len(s)] != s:
       raise ParseError("expecting " + s)
     self.i += len(s)
     self.skipwhite()
     return True

   def match_word(self):
     """Match an identifier and skip following white space.

     Returns the identifier (a non-empty run of `wordLetters`); raises
     ParseError if the cursor is not on such a character.
     """
     j = self.i
     while j < self.len and self.buf[j] in wordLetters:
       j += 1
     s = self.buf[self.i:j]
     if len(s) > 0:
       self.i = j
       self.skipwhite()
       return s
     else:
       raise ParseError("expecting identifier")

   def skiptoat(self):
     """Skip to an @ followed by a character from `wordLetters`.

     Returns True with the cursor on the '@', or None (cursor unchanged)
     when no such '@' is found.
     """
     j = self.i
     while j < self.len:
       # The `self.len-2` bound only accepts an '@' that has at least two
       # more characters after it.
       if j < self.len-2 and self.buf[j] == '@' and self.buf[j+1] in wordLetters:
         self.i = j
         return True
       j += 1
     return None

   def skipToEOL(self):
     """Advance the cursor to the next newline (not past it) or to the end."""
     j = self.i
     # `!=` instead of the Python-2-only `<>` operator, which is a syntax
     # error on Python 3.
     while j < self.len and self.buf[j] != '\n':
       j += 1
     self.i = j

   def scanString(self):
     """Consume a double-quoted or brace-delimited string at the cursor.

     Returns the text between the delimiters. The cursor is left directly
     after the closing delimiter; following white space is NOT skipped.
     Raises ParseError if the cursor is not on '"' or '{', or if the
     string is unterminated.
     """
     j = self.i
     if self.buf[j] == '"':
       s, k = self.scanQuotedString(j+1)
     elif self.buf[j] == '{':
       s, k = self.scanBraceString(j+1)
     else:
       raise ParseError("not at a string")
     self.i = k
     return s

   def scanQuotedString(self, j):
     """Scan a double-quoted string whose first content character is at `j`.

     Brace groups are copied verbatim (braces included), so a '"' inside
     braces does not end the string. Returns `(text, index)` where `index`
     is the position after the ending double-quote. Does not move the cursor.
     """
     parts = []  # collected pieces; joined once to avoid quadratic +=
     while j < self.len:
       ch = self.buf[j]
       if ch == '"':
         return "".join(parts), j+1
       elif ch == '{':
         t, j = self.scanBraceString(j+1)
         parts.append('{' + t + '}')
       else:
         parts.append(ch)
         j += 1
     raise ParseError("unterminated double quote string")

   def scanBraceString(self, j):
     """Scan a brace-delimited string whose first content character is at `j`.

     Nested braces are balanced. Returns `(text, index)` where `text`
     excludes the outer braces and `index` is the position following the
     ending brace. Does not move the cursor.
     """
     lvl = 1
     k = j
     while k < self.len:
       ch = self.buf[k]
       if ch == '}':
         lvl -= 1
         if lvl <= 0:
           return self.buf[j:k], k+1
       elif ch == '{':
         lvl += 1
       k += 1
     raise ParseError("unterminated brace string")

 def test1():
   """Check that one word surrounded by blanks is read as an identifier."""
   tok = Tokenizer("   X   ")
   tok.skipwhite()
   word = tok.match_word()
   assert word == "X"
   return (True, word)

 def test2():
   """Check that two blank-separated words are read one after the other."""
   tok = Tokenizer("   X   yzzy ")
   tok.skipwhite()
   first = tok.match_word()
   second = tok.match_word()
   assert first == "X" and second == "yzzy"
   return (True, first, second)

 def test3():
   """Check that a '"' inside braces does not end a double-quoted string."""
   tok = Tokenizer('" { " } "xyz')
   text = tok.scanString()
   word = tok.match_word()
   # On failure the assertion message shows what was actually scanned.
   assert (text == ' { " } ' and word == "xyz"), (text, word)
   return (True, text, word)

 # BibTeX parsing routines

 def parse_entries(t):
   """Parse every `@type{key, ...}` or `@type(key, ...)` entry from `t`.

   Text between entries is skipped. An `@comment` is discarded up to the
   end of its line.

   Args:
     t: a Tokenizer positioned before the entries to read.

   Returns:
     A list of `(type, key, pairs)` tuples, where `pairs` is the list
     produced by parse_kv_pairs().

   Raises:
     ParseError: if the entry type is not followed by `{` or `(`, or if
       the citation key or the comma after it is missing.
   """
   entries = []
   while t.skiptoat():
     t.match_string('@')
     w = t.match_word()
     if w.lower() == 'comment':
       t.skipToEOL()
       continue
     ch = t.peek()
     if ch not in "{(":
       # Must be raised: without it the match_string() below would
       # silently consume whatever character happens to be here.
       raise ParseError("expecting either { or (")
     t.match_string(ch)      # always succeeds
     ident = t.match_word()
     t.match_string(',')
     pairs = parse_kv_pairs(t)
     entries.append( (w, ident, pairs) )
     # no need to check ending ) or } - skiptoat() will skip over it
   # reached EOF
   return entries

 def parse_kv(t):
   """Parse one `key = value [# value ...]` field.

   Args:
     t: a Tokenizer positioned on the first character of the key.

   Returns:
     `(key, vals)` where `vals` is a list of `("string", text)` items for
     quoted/braced values and `("ident", word)` items for bare words, in
     source order. `#` separators are consumed and not recorded.

   Raises:
     ParseError: if the key or the `=` is missing, or a string value is
       unterminated.
   """
   key = t.match_word()
   t.match_string('=')
   vals = []
   # Stop at end of input instead of peeking past the buffer.
   while t.i < t.len:
     ch = t.peek()
     if ch == '"' or ch == '{':
       v = t.scanString()
       # scanString() stops right after the closing delimiter. Skip blanks
       # so that a following `#` or `,` is recognised, as it already is
       # after a bare word.
       t.skipwhite()
       vals.append( ("string", v) )
     elif ch == '#':
       t.match_string('#')
     elif ch in wordLetters:
       w = t.match_word()
       vals.append( ("ident", w) )
     else:
       break
   return (key, vals)

 def parse_kv_pairs(t):
   """Parse a comma-separated run of fields and return them as a list.

   Each item is the `(key, vals)` tuple returned by parse_kv(). Parsing
   stops at the first position that is not followed by a comma. A comma
   with no field before it is consumed without adding an item.
   """
   found = []
   while True:
     if t.peek() in wordLetters:
       found.append(parse_kv(t))
     if not t.try_match_string(","):
       return found
     t.match_string(",")

 # Sample bibliography parsed by bibtest1(): one brace-delimited entry, an
 # @comment line, and one parenthesis-delimited entry.
 bib1 = """
 %  a sample bibliography file
 %  

 @article{small,
 author = {Freely, I.P.},
 title = {A small paper},
 journal = {The journal of small papers},
 year = 1997,
 volume = {-1},
 note = {to appear},
 }

 @comment this entire line is a comment @foo {

 @article(big,
 author = {Jass, Hugh},
 title = {A big paper},
 journal = {The journal of big papers},
 year = 7991 # foo,
 volume = {MCMXCVII},
 )

 %  The authors mentioned here are almost, but not quite,
 %  entirely unrelated to Matt Groening.
 """

 import pprint

 def bibtest1():
   """Parse the sample bibliography `bib1` and pretty-print the entries."""
   entries = parse_entries(Tokenizer(bib1))
   printer = pprint.PrettyPrinter(indent=4)
   printer.pprint(entries)
	#
	# Simple BibTeX file parsing in Python.
	#
	# See `bibtest1` for an example of usage.
	#
	# This is a good overview of how to correctly parse a bibtex file:
	#
	# http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html

	import string

	# Characters accepted in identifiers (entry types, citation keys, field
	# names and bare values): ASCII letters and digits only.
	wordLetters = string.ascii_lowercase + string.ascii_uppercase + string.digits

	class ParseError(Exception):
	  """Raised when the input does not match the expected BibTeX syntax.

	  The description is stored in `msg`; str() prefixes it with
	  "parse error: ".
	  """

	  def __init__(self, msg):
	    # Initialise the base class as well so that `args` and repr() carry
	    # the message instead of being empty.
	    Exception.__init__(self, msg)
	    self.msg = msg

	  def __str__(self):
	    return "parse error: " + self.msg

	class Tokenizer:
	  """Cursor over a text buffer with the scanning helpers used by the parser.

	  Attributes:
	    buf: the text being scanned.
	    i: index in `buf` of the next unread character.
	    len: length of `buf`.
	  """

	  def __init__(self, buf):
	    self.buf = buf
	    self.i = 0
	    self.len = len(buf)

	  def peek(self):
	    """Return the character at the cursor without consuming it.

	    Indexing is unchecked, so this raises IndexError when the cursor is
	    at the end of the buffer.
	    """
	    return self.buf[self.i]

	  def skipwhite(self):
	    """Advance the cursor past any run of white space."""
	    i = self.i
	    while i < self.len and self.buf[i] in string.whitespace:
	      i += 1
	    self.i = i

	  def try_match_string(self, s):
	    """Return True if the buffer continues with `s`; the cursor stays put."""
	    return self.buf[self.i:self.i+len(s)] == s

	  def try_match_letter(self):
	    """Return True if the character at the cursor is in `wordLetters`."""
	    return self.buf[self.i] in wordLetters

	  def match_string(self, s):
	    """Match a literal string and skip following white space.

	    Returns True on success; raises ParseError if `s` is not next.
	    """
	    if self.buf[self.i:self.i+len(s)] != s:
	      raise ParseError("expecting " + s)
	    self.i += len(s)
	    self.skipwhite()
	    return True

	  def match_word(self):
	    """Match an identifier and skip following white space.

	    Returns the identifier (a non-empty run of `wordLetters`); raises
	    ParseError if the cursor is not on such a character.
	    """
	    j = self.i
	    while j < self.len and self.buf[j] in wordLetters:
	      j += 1
	    s = self.buf[self.i:j]
	    if len(s) > 0:
	      self.i = j
	      self.skipwhite()
	      return s
	    else:
	      raise ParseError("expecting identifier")

	  def skiptoat(self):
	    """Skip to an @ followed by a character from `wordLetters`.

	    Returns True with the cursor on the '@', or None (cursor unchanged)
	    when no such '@' is found.
	    """
	    j = self.i
	    while j < self.len:
	      # The `self.len-2` bound only accepts an '@' that has at least two
	      # more characters after it.
	      if j < self.len-2 and self.buf[j] == '@' and self.buf[j+1] in wordLetters:
	        self.i = j
	        return True
	      j += 1
	    return None

	  def skipToEOL(self):
	    """Advance the cursor to the next newline (not past it) or to the end."""
	    j = self.i
	    # `!=` instead of the Python-2-only `<>` operator, which is a syntax
	    # error on Python 3.
	    while j < self.len and self.buf[j] != '\n':
	      j += 1
	    self.i = j

	  def scanString(self):
	    """Consume a double-quoted or brace-delimited string at the cursor.

	    Returns the text between the delimiters. The cursor is left directly
	    after the closing delimiter; following white space is NOT skipped.
	    Raises ParseError if the cursor is not on '"' or '{', or if the
	    string is unterminated.
	    """
	    j = self.i
	    if self.buf[j] == '"':
	      s, k = self.scanQuotedString(j+1)
	    elif self.buf[j] == '{':
	      s, k = self.scanBraceString(j+1)
	    else:
	      raise ParseError("not at a string")
	    self.i = k
	    return s

	  def scanQuotedString(self, j):
	    """Scan a double-quoted string whose first content character is at `j`.

	    Brace groups are copied verbatim (braces included), so a '"' inside
	    braces does not end the string. Returns `(text, index)` where `index`
	    is the position after the ending double-quote. Does not move the cursor.
	    """
	    parts = []  # collected pieces; joined once to avoid quadratic +=
	    while j < self.len:
	      ch = self.buf[j]
	      if ch == '"':
	        return "".join(parts), j+1
	      elif ch == '{':
	        t, j = self.scanBraceString(j+1)
	        parts.append('{' + t + '}')
	      else:
	        parts.append(ch)
	        j += 1
	    raise ParseError("unterminated double quote string")

	  def scanBraceString(self, j):
	    """Scan a brace-delimited string whose first content character is at `j`.

	    Nested braces are balanced. Returns `(text, index)` where `text`
	    excludes the outer braces and `index` is the position following the
	    ending brace. Does not move the cursor.
	    """
	    lvl = 1
	    k = j
	    while k < self.len:
	      ch = self.buf[k]
	      if ch == '}':
	        lvl -= 1
	        if lvl <= 0:
	          return self.buf[j:k], k+1
	      elif ch == '{':
	        lvl += 1
	      k += 1
	    raise ParseError("unterminated brace string")

	def test1():
	  """Check that one word surrounded by blanks is read as an identifier."""
	  tok = Tokenizer(" X ")
	  tok.skipwhite()
	  word = tok.match_word()
	  assert word == "X"
	  return (True, word)

	def test2():
	  """Check that two blank-separated words are read one after the other."""
	  tok = Tokenizer(" X yzzy ")
	  tok.skipwhite()
	  first = tok.match_word()
	  second = tok.match_word()
	  assert first == "X" and second == "yzzy"
	  return (True, first, second)

	def test3():
	  """Check that a '"' inside braces does not end a double-quoted string."""
	  tok = Tokenizer('" { " } "xyz')
	  text = tok.scanString()
	  word = tok.match_word()
	  # On failure the assertion message shows what was actually scanned.
	  assert (text == ' { " } ' and word == "xyz"), (text, word)
	  return (True, text, word)

	# BibTeX parsing routines

	def parse_entries(t):
	  """Parse every `@type{key, ...}` or `@type(key, ...)` entry from `t`.

	  Text between entries is skipped. An `@comment` is discarded up to the
	  end of its line.

	  Args:
	    t: a Tokenizer positioned before the entries to read.

	  Returns:
	    A list of `(type, key, pairs)` tuples, where `pairs` is the list
	    produced by parse_kv_pairs().

	  Raises:
	    ParseError: if the entry type is not followed by `{` or `(`, or if
	      the citation key or the comma after it is missing.
	  """
	  entries = []
	  while t.skiptoat():
	    t.match_string('@')
	    w = t.match_word()
	    if w.lower() == 'comment':
	      t.skipToEOL()
	      continue
	    ch = t.peek()
	    if ch not in "{(":
	      # Must be raised: without it the match_string() below would
	      # silently consume whatever character happens to be here.
	      raise ParseError("expecting either { or (")
	    t.match_string(ch) # always succeeds
	    ident = t.match_word()
	    t.match_string(',')
	    pairs = parse_kv_pairs(t)
	    entries.append( (w, ident, pairs) )
	    # no need to check ending ) or } - skiptoat() will skip over it
	  # reached EOF
	  return entries

	def parse_kv(t):
	  """Parse one `key = value [# value ...]` field.

	  Args:
	    t: a Tokenizer positioned on the first character of the key.

	  Returns:
	    `(key, vals)` where `vals` is a list of `("string", text)` items for
	    quoted/braced values and `("ident", word)` items for bare words, in
	    source order. `#` separators are consumed and not recorded.

	  Raises:
	    ParseError: if the key or the `=` is missing, or a string value is
	      unterminated.
	  """
	  key = t.match_word()
	  t.match_string('=')
	  vals = []
	  # Stop at end of input instead of peeking past the buffer.
	  while t.i < t.len:
	    ch = t.peek()
	    if ch == '"' or ch == '{':
	      v = t.scanString()
	      # scanString() stops right after the closing delimiter. Skip blanks
	      # so that a following `#` or `,` is recognised, as it already is
	      # after a bare word.
	      t.skipwhite()
	      vals.append( ("string", v) )
	    elif ch == '#':
	      t.match_string('#')
	    elif ch in wordLetters:
	      w = t.match_word()
	      vals.append( ("ident", w) )
	    else:
	      break
	  return (key, vals)

	def parse_kv_pairs(t):
	  """Parse a comma-separated run of fields and return them as a list.

	  Each item is the `(key, vals)` tuple returned by parse_kv(). Parsing
	  stops at the first position that is not followed by a comma. A comma
	  with no field before it is consumed without adding an item.
	  """
	  found = []
	  while True:
	    if t.peek() in wordLetters:
	      found.append(parse_kv(t))
	    if not t.try_match_string(","):
	      return found
	    t.match_string(",")

	# Sample bibliography parsed by bibtest1(): one brace-delimited entry, an
	# @comment line, and one parenthesis-delimited entry.
	bib1 = """
	% a sample bibliography file
	%

	@article{small,
	author = {Freely, I.P.},
	title = {A small paper},
	journal = {The journal of small papers},
	year = 1997,
	volume = {-1},
	note = {to appear},
	}

	@comment this entire line is a comment @foo {

	@article(big,
	author = {Jass, Hugh},
	title = {A big paper},
	journal = {The journal of big papers},
	year = 7991 # foo,
	volume = {MCMXCVII},
	)

	% The authors mentioned here are almost, but not quite,
	% entirely unrelated to Matt Groening.
	"""

	import pprint

	def bibtest1():
	  """Parse the sample bibliography `bib1` and pretty-print the entries."""
	  entries = parse_entries(Tokenizer(bib1))
	  printer = pprint.PrettyPrinter(indent=4)
	  printer.pprint(entries)