Created
June 7, 2010 08:46
-
-
Save azmfaridee/428395 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.parsers.expat | |
import codecs | |
from pprint import pprint | |
# Shared parser state, filled in by the expat handlers defined in this file.
stack = []            # currently open elements, each stored as [name, attrs]
codestack = []        # generated code, each stored as [depth, code]
def_cats = {}         # def-cat name -> patterns built from its cat-items
def_attrs = {}        # def-attr name -> tag patterns built from its attr-items
def_attrs_regex = {}  # def-attr name -> alternation built from def_attrs
def_lists = {}        # def-list name -> values of its list-items

# these tags are the actual leaves
leaf_tags = [
    'clip',
    'lit',
    'lit-tag',
    'with-param',
    'var',
    'b',
    'list',
    'pattern-item',
]

# more tags to clarify: transfer, section-rules
dec_tags = [
    'cat-item',
    'def-cat',
    'section-def-cats',
    'attr-item',
    'def-attr',
    'section-def-attrs',
    'def-var',
    'list-item',
    'def-list',
    'section-def-vars',
    'section-def-lists',
]
def lcs(a, b):
    """Return the longest prefix of ``a`` that occurs anywhere in ``b``.

    Despite the historical name this is not a general longest-common-
    substring search: only prefixes of ``a`` are considered. Characters are
    dropped from the end of ``a`` until what remains is found in ``b``; the
    empty string is always found, so the loop terminates.

    Implemented as a loop rather than recursion so that long inputs cannot
    exhaust the interpreter's recursion limit.
    """
    while b.find(a) == -1:
        a = a[:-1]
    return a
def process_def_attrs():
    """Fill ``def_attrs_regex`` with one alternation per attribute definition.

    For every key of the module-level ``def_attrs`` mapping, the list of tag
    patterns stored under it is joined with ``|`` and the result is written
    to ``def_attrs_regex`` under the same key. An empty pattern list yields
    an empty string.
    """
    for def_attr, patterns in def_attrs.items():
        def_attrs_regex[def_attr] = '|'.join(patterns)
    # FIXME: trying to do some optimization in regex
    # could do the same way as Jacob's attrItemRegexp function
    # though according to jacob, regex optimization does not help
    # that much.
    # The abandoned sketch factored out the shared prefix of all patterns
    # (reduce(lcs, patterns)) and emitted prefix + '(' + remainder + ')'
    # alternatives.
# 3 handler functions | |
def start_element(name, attrs):
    """Expat start-tag handler: track the element and act on known tags.

    The element is first pushed on ``stack`` as ``[name, attrs]``. Then:

    * ``cat-item``: build a pattern from the optional ``lemma`` attribute
      and the dot-separated ``tags`` attribute, and append it to
      ``def_cats`` under the parent element's ``n`` attribute.
    * ``attr-item``: turn the dot-separated ``tags`` into ``<tag>`` groups
      and append the result to ``def_attrs`` under the parent's ``n``.
    * ``list-item``: append the ``v`` attribute to ``def_lists`` under the
      parent's ``n``.
    * ``def-macro``: read ``n`` and ``npar``; nothing is generated yet.
    * ``clip`` / ``lit-tag``: push ``[len(stack), code]`` on ``codestack``
      with the code returned by the matching ``handle_*`` function.

    Raises:
        KeyError: a required attribute is missing.
        IndexError: a declaration item has no parent element on the stack.
        ValueError: ``npar`` of a ``def-macro`` is not an integer.
    """
    stack.append([name, attrs])

    if name == 'cat-item':
        def_cat_id = stack[-2][1]['n']
        # lemma is OPTIONAL; a raw string keeps the backslash literal
        # without relying on an invalid escape sequence.
        regex = attrs.get('lemma', r'\w')
        # tags is REQUIRED, but still for safety we're checking
        if 'tags' in attrs:
            for tag in attrs['tags'].split('.'):
                # FIXME: what to do in case of empty tags?
                if tag == '':
                    continue
                if tag == '*':
                    # Appends the two characters backslash + t, whereas the
                    # no-tags branch below appends a real tab character.
                    # NOTE(review): confirm the difference is intended.
                    regex += '\\t'
                else:
                    regex += '<' + tag + '>'
        else:
            regex += '\t'
        def_cats.setdefault(def_cat_id, []).append(regex)

    elif name == 'attr-item':
        def_attr_id = stack[-2][1]['n']
        tags = attrs['tags'].split('.')
        regex = ''.join('<' + tag + '>' for tag in tags)
        def_attrs.setdefault(def_attr_id, []).append(regex)

    elif name == 'list-item':
        def_list_id = stack[-2][1]['n']
        def_lists.setdefault(def_list_id, []).append(attrs['v'])

    elif name == 'def-macro':
        # Macro code generation is not implemented yet. The attributes are
        # still read so that a <def-macro> lacking them fails as before.
        macro_name = attrs['n']
        npar = int(attrs['npar'])

    elif name == 'clip':
        codestack.append([len(stack), handle_clip(name, attrs)])

    elif name == 'lit-tag':
        codestack.append([len(stack), handle_lit_tag(name, attrs)])
def handle_lit_tag(name, attrs):
    """Return the code for a ``lit-tag`` element.

    The result is a one-item list holding a ``push`` instruction, a tab and
    the ``v`` attribute wrapped in angle brackets. ``name`` is accepted for
    symmetry with the other handlers and is not used.
    """
    return ['push\t' + '<' + attrs['v'] + '>']
def handle_clip(name, attrs):
    """Return the code for a ``clip`` element.

    For a ``part`` other than lem/lemh/lemq/whole/tags the code pushes the
    ``pos`` attribute, then the ``|``-joined patterns stored in
    ``def_attrs`` for that part. Unless a ``let`` element is currently open
    on ``stack``, a ``clipsl`` (``side == 'sl'``) or ``cliptl`` instruction
    follows. For the five built-in parts an empty list is returned.

    ``name`` is not used.

    Raises:
        KeyError: ``part``, ``pos`` or ``side`` is missing, or ``part`` has
            no entry in ``def_attrs``.
    """
    # Inside a <let> the clip instruction itself is left out.
    store_mode = any(item[0] == 'let' for item in stack)

    code = []
    # FIXME: create code for lem, lemh, lemq, whole, tags
    # NOTE(review): nesting below was reconstructed from a source whose
    # indentation was lost; confirm the clip instruction belongs under
    # this check.
    if attrs['part'] not in ('lem', 'lemh', 'lemq', 'whole', 'tags'):
        # FIXME: has to come up with a better version of the regex
        regex = '|'.join(def_attrs[attrs['part']])
        code.append('push\t' + attrs['pos'])
        code.append('push\t' + regex)
        if not store_mode:
            code.append('clipsl' if attrs['side'] == 'sl' else 'cliptl')
    return code
def end_element(name):
    """Expat end-tag handler: fold the children's code into one segment.

    For an element that is neither in ``leaf_tags`` nor in ``dec_tags``,
    every ``codestack`` entry recorded at a greater depth than the current
    one is removed and collected in document order. For ``equal`` a
    comparison instruction is appended: ``cmpi`` when the element's
    ``caseless`` attribute is ``"yes"``, ``cmp`` otherwise. The collected
    items are pushed back on ``codestack`` as ``[depth, items]``.

    In every case the element is finally popped from ``stack``.
    """
    pitem = stack[-1]
    # if this node is not a leaf as well as not from
    # declaration section
    if pitem[0] not in leaf_tags and pitem[0] not in dec_tags:
        depth = len(stack)

        # pop all the values from codestack which have higher depth than
        # current depth; they come off newest-first, so restore the order
        code_buff = []
        while codestack and codestack[-1][0] > depth:
            code_buff.append(codestack.pop()[1])
        code_buff.reverse()

        # here comes all the condition section
        # Not implemented yet: and, or, not, begins-with, ends-with,
        # contains-substring, in, let.
        if name == 'equal':
            # Anything other than caseless="yes" (including an absent
            # attribute) is a case-sensitive comparison.
            caseless = pitem[1].get('caseless') == 'yes'
            code_buff.append('cmpi' if caseless else 'cmp')

        # insert this new code segment into codestack; child segments are
        # kept as nested lists, as before
        codestack.append([depth, code_buff])

    # pop the item from call stack
    stack.pop()
def char_data(data):
    """Expat character-data handler; text content is deliberately ignored."""
    # Debug aid: print repr(data) here to inspect the text nodes.
    return None
if __name__ == '__main__':
    # Wire the three handlers into a fresh expat parser.
    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    # The context manager closes the rules file even if reading fails.
    with codecs.open('apertium-en-ca.en-ca.t1x', 'r', 'utf-8') as f:
        s = f.read()

    # This is the only chunk fed to the parser, so flag it as final; that
    # lets expat report a truncated or unclosed document.
    p.Parse(s.encode('utf-8'), True)

    #process_def_attrs()
    #print def_cats
    #print def_attrs
    #print def_lists
    pprint(codestack)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment