Created
July 19, 2018 21:41
-
-
Save larsgw/1d76e1d06ae747e69b9af21f2af079ed to your computer and use it in GitHub Desktop.
BibTeX PEG.js grammar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// BibTeX Grammar | |
// ============== | |
{ | |
// Concatenates an array of parsed fragments into a single string.
const join = function (parts) {
  return parts.join('')
}
// Builds a plain object from a list of [key, value] pairs.
// When a key occurs more than once, the last pair wins.
const toObject = function (pairs) {
  const result = {}
  for (const [name, content] of pairs) {
    result[name] = content
  }
  return result
}
// Adapted from [Zotero's reversed mapping table](https://github.com/zotero/translators/blob/master/BibTeX.js#L2353) | |
// [REPO](https://github.com/zotero/translators) | |
// Accessed 11/09/2016 | |
// Lookup table used by the Command rule: the key is the command name without
// its leading backslash, the value is the text the command is replaced with.
const commands = {
  // Escaped special characters
  "#": "#",
  "$": "$",
  "%": "%",
  "&": "&",
  "~": "~",
  "_": "_",
  "^": "^",
  "\\": "\\",
  "{": "{",
  "}": "}",

  // Commands replaced by an empty string
  "url": "",
  "href": "",

  // Latin-1 symbols
  "textexclamdown": "\u00a1",
  "textcent": "\u00a2",
  "textsterling": "\u00a3",
  "textyen": "\u00a5",
  "textbrokenbar": "\u00a6",
  "textsection": "\u00a7",
  "textasciidieresis": "\u00a8",
  "textcopyright": "\u00a9",
  "textordfeminine": "\u00aa",
  "guillemotleft": "\u00ab",
  "textlnot": "\u00ac",
  "textregistered": "\u00ae",
  "textasciimacron": "\u00af",
  "textdegree": "\u00b0",
  "textpm": "\u00b1",
  "texttwosuperior": "\u00b2",
  "textthreesuperior": "\u00b3",
  "textasciiacute": "\u00b4",
  "textmu": "\u00b5",
  "textparagraph": "\u00b6",
  "textperiodcentered": "\u00b7",
  "textonesuperior": "\u00b9",
  "textordmasculine": "\u00ba",
  "guillemotright": "\u00bb",
  "textonequarter": "\u00bc",
  "textonehalf": "\u00bd",
  "textthreequarters": "\u00be",
  "textquestiondown": "\u00bf",

  // Special letters
  "AE": "\u00c6",
  "DH": "\u00d0",
  "texttimes": "\u00d7",
  "O": "\u00d8",
  "TH": "\u00de",
  "ss": "\u00df",
  "ae": "\u00e6",
  "dh": "\u00f0",
  "textdiv": "\u00f7",
  "o": "\u00f8",
  "th": "\u00fe",
  "i": "\u0131",
  "NG": "\u014a",
  "ng": "\u014b",
  "OE": "\u0152",
  "oe": "\u0153",
  "L": "\u0141",
  "l": "\u0142",

  // Punctuation and miscellaneous symbols
  "textasciicircum": "\u02c6",
  "textacutedbl": "\u02dd",
  "textendash": "\u2013",
  "textemdash": "\u2014",
  "textbardbl": "\u2016",
  "textunderscore": "\u2017",
  "textquoteleft": "\u2018",
  "textquoteright": "\u2019",
  "quotesinglbase": "\u201a",
  "textquotedblleft": "\u201c",
  "textquotedblright": "\u201d",
  "quotedblbase": "\u201e",
  "textdagger": "\u2020",
  "textdaggerdbl": "\u2021",
  "textbullet": "\u2022",
  "textellipsis": "\u2026",
  "textperthousand": "\u2030",
  "guilsinglleft": "\u2039",
  "guilsinglright": "\u203a",
  "textfractionsolidus": "\u2044",
  "texteuro": "\u20ac",
  "textcelsius": "\u2103",
  "textnumero": "\u2116",
  "textcircledP": "\u2117",
  "textservicemark": "\u2120",
  "texttrademark": "\u2122",
  "textohm": "\u2126",
  "textestimated": "\u212e",

  // Greek letters
  "Gamma": "\u0393",
  "Delta": "\u0394",
  "Theta": "\u0398",
  "Lambda": "\u039b",
  "Xi": "\u039e",
  "Pi": "\u03a0",
  "Sigma": "\u03a3",
  "Phi": "\u03a6",
  "Psi": "\u03a8",
  "Omega": "\u03a9",
  "alpha": "\u03b1",
  "beta": "\u03b2",
  "gamma": "\u03b3",
  "delta": "\u03b4",
  "varepsilon": "\u03b5",
  "zeta": "\u03b6",
  "eta": "\u03b7",
  "theta": "\u03b8",
  "iota": "\u03b9",
  "kappa": "\u03ba",
  "lambda": "\u03bb",
  "mu": "\u03bc",
  "nu": "\u03bd",
  "xi": "\u03be",
  "pi": "\u03c0",
  "rho": "\u03c1",
  "varsigma": "\u03c2",
  "sigma": "\u03c3",
  "tau": "\u03c4",
  "upsilon": "\u03c5",
  "varphi": "\u03c6",
  "chi": "\u03c7",
  "psi": "\u03c8",
  "omega": "\u03c9",
  "vartheta": "\u03d1",
  "Upsilon": "\u03d2",
  "phi": "\u03d5",
  "varpi": "\u03d6",
  "varrho": "\u03f1",
  "epsilon": "\u03f5"
}
// Lookup table used by the Symbol rule: the key is a literal character
// sequence in the source text, the value is the character it is replaced with.
const symbols = {
  // Dashes
  "---": "\u2014",
  "--": "\u2013",

  // Quotes and primes
  "'''": "\u2034",
  "''": "\u201d",
  "``": "\u201c",
  "```": "\u2037",

  // Combined punctuation
  "!!": "\u203c",
  "?!": "\u2048",
  "!?": "\u2049",

  // Miscellaneous
  "TEL": "\u2121",
  "\\~": "\u223c",
  "~": "\u00a0"
}
// Adapted from [Astrocite BibTeX](https://github.com/dsifford/astrocite/blob/668a9e4a0cb15a21a310d38e6e3f9ec5af7db9a0/packages/astrocite-bibtex/src/constants.ts#L6-L22)
// Accessed 2018-02-18 | |
// Lookup table used by the Diacritic rule: the key is the accent mark that
// follows the backslash, the value is the combining character appended to the
// accented letter.
const diacritics = {
  // Marks written as punctuation
  "`": "\u0300",
  "'": "\u0301",
  "^": "\u0302",
  "~": "\u0303",
  "=": "\u0304",
  "\"": "\u0308",

  // Marks written as letters (plus the dot accent)
  "c": "\u0327",
  "b": "\u0331",
  "u": "\u0306",
  "v": "\u030c",
  ".": "\u0307",
  "d": "\u0323",
  "r": "\u030a",
  "H": "\u030b",
  "k": "\u0328"
}
// Adapted from [Wikipedia](https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts) | |
// Accessed 2018-02-18 | |
// Lookup table used by the MathCommand rule. The outer key is the script mark
// ("^" or "_"), the inner key is the character being raised or lowered, and
// the value is its Unicode superscript/subscript form.
const mathScripts = {
  // "^": superscript forms
  "^": {
    "0": "\u2070",
    "1": "\u00b9",
    "2": "\u00b2",
    "3": "\u00b3",
    "4": "\u2074",
    "5": "\u2075",
    "6": "\u2076",
    "7": "\u2077",
    "8": "\u2078",
    "9": "\u2079",
    "+": "\u207a",
    "-": "\u207b",
    "=": "\u207c",
    "(": "\u207d",
    ")": "\u207e",
    "n": "\u207f",
    "i": "\u2071"
  },

  // "_": subscript forms
  "_": {
    "0": "\u2080",
    "1": "\u2081",
    "2": "\u2082",
    "3": "\u2083",
    "4": "\u2084",
    "5": "\u2085",
    "6": "\u2086",
    "7": "\u2087",
    "8": "\u2088",
    "9": "\u2089",
    "+": "\u208a",
    "-": "\u208b",
    "=": "\u208c",
    "(": "\u208d",
    ")": "\u208e",
    "a": "\u2090",
    "e": "\u2091",
    "o": "\u2092",
    "x": "\u2093",
    "\u0259": "\u2094",
    "h": "\u2095",
    "k": "\u2096",
    "l": "\u2097",
    "m": "\u2098",
    "n": "\u2099",
    "p": "\u209a",
    "s": "\u209b",
    "t": "\u209c"
  }
}
// Built in string variables. This object also holds variables set by the file | |
// Adapted from [AstroCite BibTeX](https://github.com/dsifford/astrocite/blob/668a9e4a0cb15a21a310d38e6e3f9ec5af7db9a0/packages/astrocite-bibtex/src/constants.ts#L112-L148) | |
// Accessed 2018-02-22 | |
// Variable table read by the VariableValue rule and written by the
// StringEntry rule. It starts out with the predefined names below.
const stringStore = {
  // Month abbreviations, mapped to two-digit month numbers
  "jan": "01",
  "feb": "02",
  "mar": "03",
  "apr": "04",
  "may": "05",
  "jun": "06",
  "jul": "07",
  "aug": "08",
  "sep": "09",
  "oct": "10",
  "nov": "11",
  "dec": "12",

  // Journal abbreviations
  "acmcs": "ACM Computing Surveys",
  "acta": "Acta Informatica",
  "cacm": "Communications of the ACM",
  "ibmjrd": "IBM Journal of Research and Development",
  "ibmsj": "IBM Systems Journal",
  "ieeese": "IEEE Transactions on Software Engineering",
  "ieeetc": "IEEE Transactions on Computers",
  "ieeetcad": "IEEE Transactions on Computer-Aided Design of Integrated Circuits",
  "ipl": "Information Processing Letters",
  "jacm": "Journal of the ACM",
  "jcss": "Journal of Computer and System Sciences",
  "scp": "Science of Computer Programming",
  "sicomp": "SIAM Journal on Computing",
  "tocs": "ACM Transactions on Computer Systems",
  "tods": "ACM Transactions on Database Systems",
  "tog": "ACM Transactions on Graphics",
  "toms": "ACM Transactions on Mathematical Software",
  "toois": "ACM Transactions on Office Information Systems",
  "toplas": "ACM Transactions on Programming Languages and Systems",
  "tcs": "Theoretical Computer Science"
}
} | |
// TODO Better syntax errors | |
// TODO in-entry comments | |
// TESTING was [\n\r] [^@] / [^\n\r] | |
// Start rule. Scans the input for entries and skips everything else one line
// at a time. Skipped text is matched as an array, so the filter below keeps
// only the entry objects.
Main
  = entries:(
      Entry
      / [^\n\r]* [\n\r]   // a non-entry line, including its line break
      / [^\n\r]+          // non-entry text on a final line with no line break
    )*
    { return entries.filter(entry => typeof entry === 'object' && !Array.isArray(entry)) }
// ENTRIES | |
// ------- | |
// Any of the four "@..." constructs; the alternatives are tried in this order.
Entry
  = PreambleEntry
  / CommentEntry
  / StringEntry
  / RefEntry
// "@preamble" (case-insensitive) followed by one value wrapped in braces or
// parentheses. Whitespace, including line breaks, may surround the value.
PreambleEntry
  = "@" _h "preamble"i _h [{(] _ content:Value _ [)}]
    { return { type: 'preamble', value: content } }
// "@comment" (case-insensitive): everything up to the end of the line is the
// comment text; the line break itself, if any, is consumed but not kept.
CommentEntry
  = "@" _h "comment"i _h text:[^\n\r]* [\n\r]?
    { return { type: 'comment', value: text.join('') } }
// "@string" (case-insensitive) followed by a single `name = value` field in
// braces or parentheses. The definition is recorded in stringStore so later
// values can refer to it by name. As in PreambleEntry, whitespace (including
// line breaks) is accepted between the delimiters and the field.
StringEntry
  = "@" _h "string"i _h [{(] _ value:Field _ [)}]
    {
      stringStore[value[0]] = value[1]
      return {type: 'string', value}
    }
// A bibliography entry: "@", the entry type, then a label, a comma and the
// body, all wrapped in braces or parentheses.
RefEntry "reference entry"
  = "@" _h kind:RefType _h [{(] _h key:RefLabel _h "," body:RefBody [)}]
    { return { type: kind, label: key, properties: body } }
// REFERENCES | |
// ---------- | |
// Recognised entry types, matched case-insensitively and returned in lower
// case. "booklet" is listed before "book" because the first matching
// alternative wins.
RefType "reference type"
  = type:("article"i / "booklet"i / "book"i / "conference"i / "inbook"i / "incollection"i /
          "inproceedings"i / "manual"i / "mastersthesis"i / "misc"i / "phdthesis"i / "proceedings"i /
          "techreport"i / "unpublished"i)
    {
      return type.toLowerCase()
    }
// The entry label: one or more characters other than whitespace and commas.
RefLabel "reference label"
  = chars:[^ \t\n\r,]+
    { return chars.join('') }
// The part of an entry after the label: a field list turned into an object
// keyed by field name, or an empty object when there are no fields.
RefBody "reference body"
  = _ pairs:Fields _
    { return toObject(pairs) }
  / _
    { return {} }
// FIELDS | |
// ------ | |
// A comma-separated list of fields, returned as an array of [key, value]
// pairs. The first alternative covers lists whose last field has no trailing
// comma; the second covers lists where every field is followed by a comma.
Fields "field list"
  = leading:DelimitedField* final:Field
    { return leading.concat([final]) }
  / all:DelimitedField+
    { return all }
// A field followed by a comma and any amount of whitespace; yields the field.
DelimitedField
  = pair:Field _h "," _
    { return pair }
// A single `name = value` assignment, returned as a [name, value] pair.
Field "field"
  = name:Key _h "=" _h content:Value
    { return [name, content] }
// A field name: one or more ASCII letters, returned in lower case.
Key "field name"
  = letters:[a-zA-Z]+
    { return letters.join('').toLowerCase() }
// VALUES | |
// ------ | |
// A value is either a concatenation expression or a concatenable value | |
// A field value: a concatenation expression or a single concatenable value.
// Returns the text with surrounding whitespace removed and runs of whitespace
// collapsed to one space. Text made up only of an optionally signed integer is
// returned as a number instead. Text that merely starts with digits
// (e.g. "3D models" or a page range) stays a string; a bare parseInt() would
// silently cut it down to its numeric prefix.
Value "field value"
  = value:(ExpressionValue / ConcatValue)
    {
      // A variable reference can yield a number that an earlier @string stored
      const text = String(value).trim().replace(/\s+/g, ' ')
      if (/^[+-]?\d+$/.test(text)) {
        const number = parseInt(text, 10)
        // Zero keeps its text form, as before; integers too large to be
        // represented exactly also stay text
        if (number !== 0 && Number.isSafeInteger(number)) return number
      }
      return text
    }
// Concatenation expressions are of the form 'value ("#" value)+' | |
// Joins the left operand with the rest of the expression. Operands are
// converted to strings first: a variable may hold a number (numeric @string
// definitions are stored as numbers), and two numbers must be joined as text
// rather than added.
ExpressionValue
  = left:ConcatValue _h "#" _h right:(ExpressionValue / ConcatValue)
    { return String(left) + String(right) }
// A concatenable value is either a literal value or a variable reference | |
// Literals are tried before variable references.
ConcatValue
  = LiteralValue
  / VariableValue
// Variables start with a letter, and can contain alphanumerical chars and underscores | |
// A reference to a predefined or @string-defined variable. The whole name is
// lower-cased before the lookup, because stringStore keys are lower case.
// Throws an Error when no variable with that name has been defined.
VariableValue
  = variable:([a-z]i [a-z0-9_]i*)
    {
      const name = (variable[0] + join(variable[1])).toLowerCase()
      // Own-property check: a variable defined as an empty string still counts
      // as defined, and inherited keys such as "constructor" do not
      if (!Object.prototype.hasOwnProperty.call(stringStore, name)) {
        throw new Error('Variable "' + name + '" not found')
      }
      return stringStore[name]
    }
// A literal is brace-delimited, quote-delimited, or a bare integer.
LiteralValue
  = BracketValue
  / QuoteValue
  / IntegerValue
// Text wrapped in braces. Nested brace groups are parsed recursively and their
// content is kept without the braces.
BracketValue
  = "{" parts:(BracketValue / BracketString)* "}"
    { return parts.join('') }
// Text wrapped in double quotes. Brace groups inside it are parsed as
// BracketValue and their content is kept without the braces.
QuoteValue
  = '"' parts:(BracketValue / QuoteString)* '"'
    { return parts.join('') }
// An unquoted integer with an optional leading minus sign, returned as text.
IntegerValue
  = minus:("-" / "") figures:[0-9]+
    { return minus + figures.join('') }
// A run of text inside braces: tokens are converted, every other character
// except a brace is kept as is.
BracketString
  = pieces:(Token / [^{}])+
    { return pieces.join('') }
// A run of text inside quotes: tokens are converted, every other character
// except a double quote or a brace is kept as is.
QuoteString
  = pieces:(Token / [^"{}])+
    { return pieces.join('') }
// TOKENS | |
// ------ | |
// Character sequences that are replaced during parsing. The order matters:
// the first alternative that matches wins.
Token
  = Math
  / Diacritic
  / Symbol
  / Command
// A single math command or regular command enclosed in "$" signs; yields the
// converted content without the dollar signs.
Math
  = "$" content:(MathCommand / Command) "$"
    { return content }
// A one-character superscript ("^{x}") or subscript ("_{x}"), replaced by the
// matching entry of mathScripts. The mark class lists "_" first because a
// leading "^" would negate the class. The predicate makes the rule fail,
// rather than return undefined, when the table has no entry for the
// character, so unsupported input falls through to the other alternatives.
MathCommand
  = mark:[_^] "{" char:[^{}] "}" &{ return Object.prototype.hasOwnProperty.call(mathScripts[mark], char) }
    { return mathScripts[mark][char] }
// A backslash command, replaced by its entry in the commands table.
// The name is either one escaped special character or the longest run of
// letters after the backslash. Reading the whole run before the lookup stops
// short names from shadowing longer ones ("o" vs "oe"/"omega", "i" vs "iota",
// "th" vs "theta"), and makes every key of the table available, including
// "L" and "l". The predicate makes the rule fail for names that are not in
// the table, so unknown commands are left for the surrounding rules.
Command
  = "\\" name:([#$%&~_^\\{}] / letters:[a-z]i+ { return letters.join('') })
    &{ return Object.prototype.hasOwnProperty.call(commands, name) }
    { return commands[name] }
// Literal character sequences replaced by one character from the symbols
// table. Longer sequences are listed before their prefixes so they win.
Symbol
  = sequence:(
      "---" / "--"
      / "'''" / "''"
      / "```" / "``"
      / "!!" / "?!" / "!?"
      / "TEL"
      / "\\~" / "~"
    )
    { return symbols[sequence] }
// An accent command. The result is the base character followed by the
// combining character for the accent mark.
//  - letter marks take the base after a space or in braces
//  - punctuation marks take the base directly or in braces
Diacritic
  = "\\" accent:DiacriticExtraMark
    base:(
      " " target:DiacriticTarget { return target }
      / "{" target:DiacriticTarget "}" { return target }
    )
    { return base + diacritics[accent] }
  / "\\" accent:DiacriticSimpleMark
    base:(
      DiacriticTarget
      / "{" target:DiacriticTarget "}" { return target }
    )
    { return base + diacritics[accent] }
// The character an accent is applied to: the result of a command, or a single
// ASCII letter.
DiacriticTarget
  = Command
  / [a-zA-Z]

// Accent marks written as punctuation.
DiacriticSimpleMark
  = [`'"^=~.]

// Accent marks written as a letter.
DiacriticExtraMark
  = [bcdHkruv]
// MISC | |
// ---- | |
// Any run (possibly empty) of spaces, tabs and line breaks.
_ "optional whitespace"
  = [ \t\r\n]*

// Any run (possibly empty) of spaces and tabs; never crosses a line break.
_h "horizontal whitespace"
  = [\t ]*

// Any run (possibly empty) of line breaks.
_v "vertical whitespace"
  = [\r\n]*
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment