Skip to content

Instantly share code, notes, and snippets.

@larsgw
Created July 19, 2018 21:41
Show Gist options
  • Save larsgw/1d76e1d06ae747e69b9af21f2af079ed to your computer and use it in GitHub Desktop.
Save larsgw/1d76e1d06ae747e69b9af21f2af079ed to your computer and use it in GitHub Desktop.
BibTeX PEG.js grammar
// BibTeX Grammar
// ==============
{
// Concatenate an array of matched characters/strings into a single string.
const join = (array) => {
  return array.join('')
}
// Build a plain object from an array of [key, value] pairs.
// When a key occurs more than once, the last pair wins.
const toObject = (array) => {
  const object = {}
  for (const [key, value] of array) {
    object[key] = value
  }
  return object
}
// Lookup table from a LaTeX command name (written without its leading
// backslash) to the text that replaces it. The Command rule returns
// commands[name] for the name it matched.
// The first entries map escaped special characters to themselves; "url" and
// "href" map to the empty string.
// Adapted from [Zotero's reversed mapping table](https://github.com/zotero/translators/blob/master/BibTeX.js#L2353)
// [REPO](https://github.com/zotero/translators)
// Accessed 11/09/2016
const commands = {
"#": "#",
"$": "$",
"%": "%",
"&": "&",
"~": "~",
"_": "_",
"^": "^",
"\\": "\\",
"{": "{",
"}": "}",
"url": "",
"href": "",
"textexclamdown": "\u00a1",
"textcent": "\u00a2",
"textsterling": "\u00a3",
"textyen": "\u00a5",
"textbrokenbar": "\u00a6",
"textsection": "\u00a7",
"textasciidieresis": "\u00a8",
"textcopyright": "\u00a9",
"textordfeminine": "\u00aa",
"guillemotleft": "\u00ab",
"textlnot": "\u00ac",
"textregistered": "\u00ae",
"textasciimacron": "\u00af",
"textdegree": "\u00b0",
"textpm": "\u00b1",
"texttwosuperior": "\u00b2",
"textthreesuperior": "\u00b3",
"textasciiacute": "\u00b4",
"textmu": "\u00b5",
"textparagraph": "\u00b6",
"textperiodcentered": "\u00b7",
"textonesuperior": "\u00b9",
"textordmasculine": "\u00ba",
"guillemotright": "\u00bb",
"textonequarter": "\u00bc",
"textonehalf": "\u00bd",
"textthreequarters": "\u00be",
"textquestiondown": "\u00bf",
"AE": "\u00c6",
"DH": "\u00d0",
"texttimes": "\u00d7",
"O": "\u00d8",
"TH": "\u00de",
"ss": "\u00df",
"ae": "\u00e6",
"dh": "\u00f0",
"textdiv": "\u00f7",
"o": "\u00f8",
"th": "\u00fe",
"i": "\u0131",
"NG": "\u014a",
"ng": "\u014b",
"OE": "\u0152",
"oe": "\u0153",
"L": "\u0141",
"l": "\u0142",
"textasciicircum": "\u02c6",
"textacutedbl": "\u02dd",
"textendash": "\u2013",
"textemdash": "\u2014",
"textbardbl": "\u2016",
"textunderscore": "\u2017",
"textquoteleft": "\u2018",
"textquoteright": "\u2019",
"quotesinglbase": "\u201a",
"textquotedblleft": "\u201c",
"textquotedblright": "\u201d",
"quotedblbase": "\u201e",
"textdagger": "\u2020",
"textdaggerdbl": "\u2021",
"textbullet": "\u2022",
"textellipsis": "\u2026",
"textperthousand": "\u2030",
"guilsinglleft": "\u2039",
"guilsinglright": "\u203a",
"textfractionsolidus": "\u2044",
"texteuro": "\u20ac",
"textcelsius": "\u2103",
"textnumero": "\u2116",
"textcircledP": "\u2117",
"textservicemark": "\u2120",
"texttrademark": "\u2122",
"textohm": "\u2126",
"textestimated": "\u212e",
"Gamma": "\u0393",
"Delta": "\u0394",
"Theta": "\u0398",
"Lambda": "\u039b",
"Xi": "\u039e",
"Pi": "\u03a0",
"Sigma": "\u03a3",
"Phi": "\u03a6",
"Psi": "\u03a8",
"Omega": "\u03a9",
"alpha": "\u03b1",
"beta": "\u03b2",
"gamma": "\u03b3",
"delta": "\u03b4",
"varepsilon": "\u03b5",
"zeta": "\u03b6",
"eta": "\u03b7",
"theta": "\u03b8",
"iota": "\u03b9",
"kappa": "\u03ba",
"lambda": "\u03bb",
"mu": "\u03bc",
"nu": "\u03bd",
"xi": "\u03be",
"pi": "\u03c0",
"rho": "\u03c1",
"varsigma": "\u03c2",
"sigma": "\u03c3",
"tau": "\u03c4",
"upsilon": "\u03c5",
"varphi": "\u03c6",
"chi": "\u03c7",
"psi": "\u03c8",
"omega": "\u03c9",
"vartheta": "\u03d1",
"Upsilon": "\u03d2",
"phi": "\u03d5",
"varpi": "\u03d6",
"varrho": "\u03f1",
"epsilon": "\u03f5"
}
// Lookup table from a literal character sequence in the source text to the
// Unicode character that replaces it. The Symbol rule returns symbols[value]
// for the sequence it matched.
const symbols = {
"---": "\u2014",
"--": "\u2013",
"'''": "\u2034",
"''": "\u201d",
"``": "\u201c",
"```": "\u2037",
"!!": "\u203c",
"?!": "\u2048",
"!?": "\u2049",
"TEL": "\u2121",
"\\~": "\u223c",
"~": "\u00a0"
}
// Lookup table from a LaTeX accent mark (the character following the
// backslash) to a Unicode combining character. The Diacritic rule appends
// diacritics[mark] after the base character it matched.
// Adapted from [Astrocite BibTeX](https://github.com/dsifford/astrocite/blob/668a9e4a0cb15a21a310d38e6e3f9ec5af7db9a0/packages/astrocite-bibtex/src/constants.ts#L6-L22)
// Accessed 2018-02-18
const diacritics = {
"`": "\u0300",
"'": "\u0301",
"^": "\u0302",
"~": "\u0303",
"=": "\u0304",
"\"": "\u0308",
"c": "\u0327",
"b": "\u0331",
"u": "\u0306",
"v": "\u030c",
".": "\u0307",
"d": "\u0323",
"r": "\u030a",
"H": "\u030b",
"k": "\u0328"
}
// Two-level lookup table for math-mode scripts: the outer key is the script
// mark ("^" or "_"), the inner key is the character to convert, and the value
// is the Unicode superscript/subscript character. The MathCommand rule reads
// it as mathScripts[mark][char].
// Adapted from [Wikipedia](https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts)
// Accessed 2018-02-18
const mathScripts = {
// superscript
"^": {
"0": "\u2070",
"1": "\u00b9",
"2": "\u00b2",
"3": "\u00b3",
"4": "\u2074",
"5": "\u2075",
"6": "\u2076",
"7": "\u2077",
"8": "\u2078",
"9": "\u2079",
"+": "\u207a",
"-": "\u207b",
"=": "\u207c",
"(": "\u207d",
")": "\u207e",
"n": "\u207f",
"i": "\u2071"
},
// subscript
"_": {
"0": "\u2080",
"1": "\u2081",
"2": "\u2082",
"3": "\u2083",
"4": "\u2084",
"5": "\u2085",
"6": "\u2086",
"7": "\u2087",
"8": "\u2088",
"9": "\u2089",
"+": "\u208a",
"-": "\u208b",
"=": "\u208c",
"(": "\u208d",
")": "\u208e",
"a": "\u2090",
"e": "\u2091",
"o": "\u2092",
"x": "\u2093",
"\u0259": "\u2094",
"h": "\u2095",
"k": "\u2096",
"l": "\u2097",
"m": "\u2098",
"n": "\u2099",
"p": "\u209a",
"s": "\u209b",
"t": "\u209c"
}
}
// Built in string variables. This object also holds variables set by the file
// The StringEntry rule writes each @string definition into this object, and
// the VariableValue rule reads variable references back out of it.
// Keys are lower-case; the month abbreviations map to two-digit strings.
// Adapted from [AstroCite BibTeX](https://github.com/dsifford/astrocite/blob/668a9e4a0cb15a21a310d38e6e3f9ec5af7db9a0/packages/astrocite-bibtex/src/constants.ts#L112-L148)
// Accessed 2018-02-22
const stringStore = {
"jan": "01",
"feb": "02",
"mar": "03",
"apr": "04",
"may": "05",
"jun": "06",
"jul": "07",
"aug": "08",
"sep": "09",
"oct": "10",
"nov": "11",
"dec": "12",
"acmcs": "ACM Computing Surveys",
"acta": "Acta Informatica",
"cacm": "Communications of the ACM",
"ibmjrd": "IBM Journal of Research and Development",
"ibmsj": "IBM Systems Journal",
"ieeese": "IEEE Transactions on Software Engineering",
"ieeetc": "IEEE Transactions on Computers",
"ieeetcad": "IEEE Transactions on Computer-Aided Design of Integrated Circuits",
"ipl": "Information Processing Letters",
"jacm": "Journal of the ACM",
"jcss": "Journal of Computer and System Sciences",
"scp": "Science of Computer Programming",
"sicomp": "SIAM Journal on Computing",
"tocs": "ACM Transactions on Computer Systems",
"tods": "ACM Transactions on Database Systems",
"tog": "ACM Transactions on Graphics",
"toms": "ACM Transactions on Mathematical Software",
"toois": "ACM Transactions on Office Information Systems",
"toplas": "ACM Transactions on Programming Languages and Systems",
"tcs": "Theoretical Computer Science"
}
}
// TODO Better syntax errors
// TODO in-entry comments
// TESTING was [\n\r] [^@] / [^\n\r]
// Entry point: returns the parsed entry objects in document order.
// Input that is not a recognised entry is consumed one line at a time and
// dropped. Skipped lines come back from the parser as arrays, so the filter
// keeps only the plain objects produced by the Entry rules.
// The last alternative consumes a final junk line that has no trailing line
// break; without it such input fails to parse at end of file. It can only
// match at end of input, because any line followed by a line break is already
// taken by the alternative before it.
Main
  = entries:(
      Entry
    / [^\n\r]* [\n\r]
    / [^\n\r]+
    )*
    { return entries.filter(entry => typeof entry === 'object' && !Array.isArray(entry)) }
// ENTRIES
// -------
// Any top-level "@" entry. The special entry kinds (preamble, comment, string)
// are attempted before ordinary reference entries.
Entry
  = PreambleEntry
  / CommentEntry
  / StringEntry
  / RefEntry
// "@preamble" followed by a single value wrapped in braces or parentheses.
// The opening and closing delimiters are each matched from a set, so they are
// not required to pair up. Whitespace, including line breaks, is allowed
// around the value.
PreambleEntry
  = "@" _h "preamble"i _h [{(] _ value:Value _ [)}]
    { return { type: 'preamble', value: value } }
// "@comment" entry: everything after the keyword up to the end of the line
// becomes the comment text. The terminating line break, if any, is consumed
// but not included in the value.
CommentEntry
  = "@" _h "comment"i _h value:[^\n\r]* [\n\r]?
    {
      const text = value.join('')
      return { type: 'comment', value: text }
    }
// "@string" entry: defines one variable as `name = value`.
// The definition is recorded in stringStore so that later variable references
// can resolve it, and is also returned as a {type: 'string'} entry whose value
// is the [name, value] pair.
// Whitespace (including line breaks) is accepted between the delimiters and
// the definition, matching PreambleEntry; previously `@string{ foo = "bar" }`
// failed to parse because no whitespace was allowed there.
StringEntry
  = "@" _h "string"i _h [{(] _ value:Field _ [)}]
    {
      const [name, text] = value
      stringStore[name] = text
      return { type: 'string', value }
    }
// An ordinary reference entry: "@" type, an opening delimiter, the citation
// label, a comma, and then the (possibly empty) field list.
// Produces {type, label, properties}.
RefEntry "reference entry"
  = "@" _h type:RefType _h [{(] _h label:RefLabel _h "," properties:RefBody [)}]
    { return { type: type, label: label, properties: properties } }
// REFERENCES
// ----------
// The recognised entry types, matched case-insensitively and returned in
// lower case. "booklet" is listed before "book" because the choice is
// ordered and "book" is a prefix of "booklet".
// The last alternative was misspelled "unpunlished", so @unpublished entries
// were never recognised.
RefType "reference type"
  = type:(
      "article"i / "booklet"i / "book"i / "conference"i / "inbook"i / "incollection"i
    / "inproceedings"i / "manual"i / "mastersthesis"i / "misc"i / "phdthesis"i
    / "proceedings"i / "techreport"i / "unpublished"i
    )
    { return type.toLowerCase() }
// The citation label: one or more characters other than space, tab, line
// break or comma, returned as a single string.
RefLabel "reference label"
  = label:[^ \t\n\r,]+
    { return label.join('') }
// The body of a reference entry: a field list surrounded by optional
// whitespace, converted into an object keyed by field name. When no field
// list is present the result is an empty object.
RefBody "reference body"
  = _ fields:Fields _
    {
      const properties = {}
      for (const [key, value] of fields) {
        properties[key] = value
      }
      return properties
    }
  / _
    { return {} }
// FIELDS
// ------
// A non-empty list of fields, returned as an array of [key, value] pairs.
// First alternative: the final field has no trailing comma.
// Second alternative: every field, including the last, ends with a comma.
Fields "field list"
  = fields:DelimitedField* lastField:Field
    { return fields.concat([lastField]) }
  / fields:DelimitedField+
    { return fields }
// A field followed by its separating comma. Only horizontal whitespace may
// precede the comma; any whitespace, including line breaks, may follow it.
DelimitedField
  = field:Field _h "," _
    { return field }
// A single `key = value` assignment, returned as a two-element [key, value]
// array. Only horizontal whitespace is allowed around the "=".
Field "field"
  = key:Key _h "=" _h value:Value
    {
      const pair = [key, value]
      return pair
    }
// A field name, returned in lower case.
// It must start with a letter and may continue with letters, digits,
// underscores and hyphens. The original rule accepted letters only, so
// entries containing fields such as `date-added` or `bdsk-url-1` could not be
// parsed. Every name accepted before is still accepted.
Key "field name"
  = first:[a-z]i rest:[a-z0-9_\-]i*
    { return (first + join(rest)).toLowerCase() }
// VALUES
// ------
// A value is either a concatenation expression or a concatenable value.
// The text is trimmed and runs of whitespace are collapsed to one space.
// If the whole result is an optionally signed run of digits it is returned as
// a number, otherwise as a string.
// The original tested `parseInt(value)` for truthiness, which accepts any
// numeric prefix: "3D printing" became 3 and a page range such as "10–20"
// became 10. It also called `.trim()` on a number when a variable held 0.
// Digit strings too large to be represented exactly stay strings so that
// identifiers are not silently rounded.
Value "field value"
  = value:(ExpressionValue / ConcatValue)
    {
      // A variable reference may yield a number, so coerce before string ops.
      const text = String(value).trim().replace(/\s+/g, ' ')
      if (/^-?[0-9]+$/.test(text)) {
        const number = parseInt(text, 10)
        if (Number.isSafeInteger(number)) return number
      }
      return text
    }
// Concatenation expressions are of the form 'value ("#" value)+'.
// Both operands are converted to strings before joining: a variable can hold
// a number, and `left + right` would then add the two numbers instead of
// concatenating their text.
// Whitespace around "#" may include line breaks, so a concatenation can be
// continued on the next line.
ExpressionValue
  = left:ConcatValue _ "#" _ right:(ExpressionValue / ConcatValue)
    { return String(left) + String(right) }
// A single operand of a concatenation: a literal is attempted first, then a
// variable reference.
ConcatValue
  = LiteralValue
  / VariableValue
// Variables start with a letter, and can contain alphanumerical chars and underscores.
// The name is looked up in stringStore and the stored value is returned.
// Throws an Error when no variable of that name has been defined.
//
// The whole name is lower-cased before the lookup. The original lower-cased
// everything except the first character (operator precedence), so a reference
// such as `Jan` was reported as not found even though keys are stored in
// lower case.
// The lookup is an own-property check rather than a truthiness test, so a
// variable defined as an empty string or zero still resolves, and inherited
// names such as "constructor" are not mistaken for variables.
VariableValue
  = variable:([a-z]i [a-z0-9_]i*)
    {
      const name = (variable[0] + join(variable[1])).toLowerCase()
      if (!Object.prototype.hasOwnProperty.call(stringStore, name)) {
        throw new Error('Variable "' + name + '" not found')
      }
      return stringStore[name]
    }
// A literal operand: brace-delimited text, quote-delimited text, or a bare
// integer, attempted in that order.
LiteralValue
  = BracketValue
  / QuoteValue
  / IntegerValue
// Brace-delimited text. Nested brace groups are parsed recursively and their
// contents are spliced into the result, so the braces themselves do not
// appear in the returned string.
BracketValue
  = "{" value:(BracketValue / BracketString)* "}"
    { return value.join('') }
// Double-quote-delimited text. It may contain brace groups, whose contents
// are spliced into the returned string without the braces.
QuoteValue
  = '"' value:(BracketValue / QuoteString)* '"'
    { return value.join('') }
// A bare integer with an optional leading minus sign, returned as a string.
IntegerValue
  = sign:"-"? digits:[0-9]+
    {
      // An unmatched optional yields no text; substitute the empty string.
      const prefix = sign || ''
      return prefix + digits.join('')
    }
// A run of text inside braces: translated tokens where one matches, otherwise
// any single character that is not a brace.
BracketString
  = value:(Token / [^{}])+
    { return value.join('') }
// A run of text inside double quotes: translated tokens where one matches,
// otherwise any single character that is not a double quote or a brace.
QuoteString
  = value:(Token / [^"{}])+
    { return value.join('') }
// TOKENS
// ------
// A piece of markup that is translated to Unicode text. The alternatives are
// attempted in the order listed.
Token
  = Math
  / Diacritic
  / Symbol
  / Command
// Inline math holding exactly one script expression or command between "$"
// delimiters; the delimiters are dropped from the result.
Math
  = "$" value:(MathCommand / Command) "$"
    { return value }
// A superscript ("^{x}") or subscript ("_{x}") of a single character,
// translated through the mathScripts table.
//
// The mark was written as `[^_]`, which is a negated class: it matched every
// character except "_". Subscripts therefore never matched, and any other
// leading character caused a TypeError when mathScripts[mark] was undefined.
// The mark is now limited to the two keys of mathScripts.
// The predicate makes the rule fail, rather than return undefined, when the
// table has no entry for the character. It also lets the subscript letters in
// the table match, which the old fixed character class left out.
MathCommand
  = mark:[_^] "{" char:[^{}] "}"
    &{ return Object.prototype.hasOwnProperty.call(mathScripts[mark], char) }
    { return mathScripts[mark][char] }
// A backslash command translated through the commands table: either a single
// escaped special character, or a name made of letters.
//
// The name is read as the longest run of letters and then checked against the
// table. The original listed every name in an ordered choice where short
// names came before longer ones that start the same way ("O" before "OE" and
// "Omega", "o" before "oe" and "omega", "th" before "theta", "i" before
// "iota", "textdagger" before "textdaggerdbl"), so for example `\theta`
// produced "þeta". It also omitted "L" and "l" even though the table defines
// them.
// When the name is not in the table the rule fails and the text is left for
// the surrounding rules to consume literally.
Command
  = "\\" name:(
      [#$%&~_^\\{}]
    / letters:[a-z]i+ { return join(letters) }
    )
    &{ return Object.prototype.hasOwnProperty.call(commands, name) }
    { return commands[name] }
// Literal character sequences replaced by a single Unicode character through
// the symbols table. Longer sequences are listed before the shorter ones they
// start with, because the choice is ordered.
// The "TEL" alternative has been removed: it matched inside ordinary text, so
// capitalised words such as "HOTEL" or "INTEL" were rewritten with the
// telephone sign.
Symbol
  = value:("---" / "--" / "'''" / "''" / "```" / "``" / "!!" / "?!" / "!?" / "\\~" / "~")
    { return symbols[value] }
// An accent command applied to one base character. The result is the base
// character followed by the combining mark from the diacritics table.
// Letter marks need a space or braces between the mark and the base
// character; punctuation marks accept the base character directly or in
// braces.
Diacritic
  = "\\" mark:DiacriticExtraMark
    char:(
        " " target:DiacriticTarget { return target }
      / "{" target:DiacriticTarget "}" { return target }
    )
    { return char + diacritics[mark] }
  / "\\" mark:DiacriticSimpleMark
    char:(
        target:DiacriticTarget { return target }
      / "{" target:DiacriticTarget "}" { return target }
    )
    { return char + diacritics[mark] }
// The base of an accent: a translated command or a single ASCII letter of
// either case.
DiacriticTarget
  = Command
  / [a-z]i
// Punctuation accent marks.
DiacriticSimpleMark
  = [`'"^=~.]
// Letter accent marks.
DiacriticExtraMark
  = [bcdHkruv]
// MISC
// ----
// Zero or more spaces, tabs or line-break characters.
_ "optional whitespace"
  = [ \t\n\r]*
// Zero or more spaces or tabs; never crosses a line break.
_h "horizontal whitespace"
  = [ \t]*
// Zero or more line-break characters.
_v "vertical whitespace"
  = [\n\r]*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment