Last active
July 13, 2017 01:08
-
-
Save fauxneticien/ff9935d66ba27e472bd3a2c9dc22ab6d to your computer and use it in GitHub Desktop.
Parse backslash-coded lexicon using a defined grammar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Parse a backslash-coded (Toolbox) lexicon using a chunk grammar.

The lexicon is read from STDIN.  The first argument selects the output
type ("xml" or "json"); every following argument is one grammar rule, and
the rules are joined with newlines to form the grammar.

Usage: pass in a grammar from a text file, or define it rule-by-rule as
the arguments following the output type:

    python chunker.py < lexicon.txt "xml" $(cat grammar.txt)
    python chunker.py < lexicon.txt "json" "examples:{<text><translation>}" "headword:{<lx><ps><examples>*}"

This is a Python 2 script (it depends on cStringIO).
"""
import cStringIO
import json
import sys
import xml.dom.minidom
from xml.etree.ElementTree import ElementTree

import xmltodict
from nltk.toolbox import ToolboxData
from toolz.functoolz import pipe

# Output types accepted as the first command-line argument
OUTPUT_TYPES = ("xml", "json")


def main(argv):
    """Convert the lexicon on STDIN to XML or JSON on STDOUT.

    argv is the full argument vector: argv[1] is the output type and
    argv[2:] are the grammar rules.  Returns the process exit status:
    0 on success, 1 when the output type is missing or unrecognised.
    """
    # Validate before touching STDIN so a bad invocation fails immediately
    # instead of after the whole lexicon has been parsed.
    if len(argv) < 2 or argv[1] not in OUTPUT_TYPES:
        # Report on STDERR so redirected output is not polluted
        sys.stderr.write('Error: output type should be "xml" or "json"\n')
        return 1

    output_type = argv[1]

    # Expecting grammar definition as following arguments
    grammar = "\n".join(argv[2:])

    # Expecting lexicon data as redirect from STDIN; the parser is pointed
    # at it by setting its (private) _file attribute directly.
    lexicon = ToolboxData()
    lexicon._file = sys.stdin

    # Use StringIO to avoid having to do actual file I/O
    xml_temp = cStringIO.StringIO()

    # Parse lexicon using provided grammar, then 'write' to xml_temp 'file'
    lexicon_tree = pipe(grammar, lexicon.parse, ElementTree)
    lexicon_tree.write(xml_temp, encoding='utf-8')
    xml_text = xml_temp.getvalue()

    if output_type == "xml":
        # Re-parse with minidom only to get indented output
        pretty_xml = xml.dom.minidom.parseString(xml_text).toprettyxml()
        sys.stdout.write(pretty_xml + "\n")
    else:
        # Read in from xml_temp 'file' as dict, then dump the records as JSON to STDOUT
        lexicon_dict = xmltodict.parse(xml_text)
        json.dump(lexicon_dict['toolbox_data']['record'], sys.stdout, indent=2)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
examples:{<text><translation>}
headword:{<lx><ps><examples>*}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\lx bonjour
\ps Exclamation
\lx hallo
\ps Exclamation
\text Wir sagen 'hallo' auf Deutsch
\translation We say hello in German
\lx auto
\ps Noun
\text Das ist ein Auto
\translation This is a car
\text Das ist mein Auto
\translation This is my car
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment