Created
July 5, 2010 14:58
-
-
Save azmfaridee/464430 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs
import sys
import xml.parsers.expat
from pprint import pprint
# When true, end_element prints the code stack while inside a def-macro.
DEBUG_MODE = False

# this stack holds information of the tree: one [name, attrs] entry per
# element that is currently open
stack = []
# generated code is held in this stack as [depth, tag name, statements]
# entries and merged when necessary
codestack = []
# use this dictionary to keep track of the current tags' children
children = {}
labels = []
chooseid = 1
whenid = 1
# delayed code is stored here
pending_code = []
def_cats = {}
def_attrs = {}
# regexes generated from the input def_attrs
def_attrs_regex = {}
def_lists = {}
variables = {}
# mode variables
macro_mode = False
assign_mode = False
store_mode = False
# only these tags are actual leaves
leaf_tags = ['clip', 'lit', 'lit-tag', 'with-param', 'var', 'b', 'list', 'pattern-item']
# these tags do not need processing for code generation
# more tags to clarify: transfer, section-rules
dec_tags = ['cat-item', 'def-cat', 'section-def-cats', 'attr-item', 'def-attr', 'section-def-attrs', 'def-var', 'list-item', 'def-list', 'section-def-vars', 'section-def-lists']
def lcs(a, b):
    """Return the longest prefix of *a* that occurs somewhere in *b*.

    Despite the historical name this is not a general longest-common-
    substring search: only prefixes of *a* are considered.  Characters are
    dropped from the end of *a* until what remains is found in *b*; the
    empty string is always found, so the loop terminates.
    """
    while b.find(a) == -1:
        a = a[:-1]
    return a
def process_def_attrs():
    """Build one alternation regex per attribute definition.

    For every key of the module-level ``def_attrs`` dictionary, the key's
    list of patterns is joined with ``|`` and stored under the same key in
    ``def_attrs_regex``.  An empty pattern list yields an empty string.
    """
    # FIXME: trying to do some optimization in regex
    # could do the same way as Jacob's attrItemRegexp function
    # though according to jacob, regex optimization does not help
    # that much.  The abandoned idea was to factor the prefix shared by all
    # patterns (reduce(lcs, patterns)) out of the alternation.
    for def_attr, patterns in def_attrs.items():
        def_attrs_regex[def_attr] = '|'.join(patterns)
# 3 handler functions | |
def _cat_item_regex(attrs):
    """Build the regex string for one ``cat-item`` element from *attrs*."""
    # lemma is OPTIONAL
    regex = attrs['lemma'] if 'lemma' in attrs else r'\w'
    # tags is REQUIRED, but still for safety we're checking
    if 'tags' not in attrs:
        return regex + '\t'
    for tag in attrs['tags'].split('.'):
        # FIXME: what to do in case of empty tags?
        if tag == '':
            continue
        if tag == '*':
            # NOTE(review): this appends a literal backslash followed by 't',
            # whereas the missing-tags case above appends a real tab
            # character -- confirm which of the two is intended.
            regex = regex + '\\t'
            continue
        regex = regex + '<' + tag + '>'
    return regex


def start_element(name, attrs):
    """Handle an opening tag (installed as the expat StartElementHandler).

    Pushes ``[name, attrs]`` on ``stack``, registers the element as a child
    of its parent in ``children`` (declaration tags are skipped), fills the
    declaration tables (``def_cats``, ``def_attrs``, ``variables``,
    ``def_lists``) and pushes start-of-element code on ``codestack`` for
    ``def-macro``, ``when``, ``clip``, ``lit-tag`` and ``lit``.

    Args:
        name: tag name of the element being opened.
        attrs: dictionary of the element's attributes.

    Raises:
        KeyError: if an attribute that a handled tag reads is missing.
        ValueError: if ``npar`` of a ``def-macro`` is not an integer.
    """
    # Without this declaration the assignment below would only bind a local.
    global macro_mode

    stack.append([name, attrs])

    # skip the dec_tags (and names that already have an open entry)
    if name not in children and name not in dec_tags:
        # empty entry for this node
        children[name] = []
        # update this node's parent with this child node's value
        if len(stack) > 1:
            # setdefault: the parent's entry may already have been popped by
            # a nested element of the same name that closed earlier.
            siblings = children.setdefault(stack[-2][0], [])
            if [name, attrs] not in siblings:
                siblings.append([name, attrs])

    if name == 'cat-item':
        def_cats.setdefault(stack[-2][1]['n'], []).append(_cat_item_regex(attrs))
    elif name == 'attr-item':
        regex = ''.join('<' + tag + '>' for tag in attrs['tags'].split('.'))
        def_attrs.setdefault(stack[-2][1]['n'], []).append(regex)
    elif name == 'def-var':
        # .get() instead of .setdefault(): do not modify the parser's dict
        variables[attrs['n']] = attrs.get('v', '')
    elif name == 'list-item':
        def_lists.setdefault(stack[-2][1]['n'], []).append(attrs['v'])
    elif name == 'def-macro':
        macro_mode = True
        # npar is not used yet; it is still parsed so that a missing or
        # non-numeric value keeps raising as before
        int(attrs['npar'])
        # same 'macro_<name>_' prefix as the end label built in end_element
        label = 'macro_' + attrs['n'] + '_start'
        labels.append(label)
        codestack.append([len(stack), 'def-macro', [label + ': nop']])
    elif name == 'when':
        label = u'when' + str(whenid) + u'_start'
        labels.append(label)
        codestack.append([len(stack), 'when', [label + ': nop']])
    elif name == 'clip':
        codestack.append([len(stack), 'clip', handle_clip(name, attrs)])
    elif name == 'lit-tag':
        codestack.append([len(stack), 'lit-tag', handle_lit_tag(name, attrs)])
    elif name == 'lit':
        # FIXME: fix the problem with empty lit e.g. <lit v=""/>
        # tab-separated like every other emitted push instruction
        codestack.append([len(stack), 'lit', ['push\t' + attrs['v']]])
    # 'choose' generates no start code yet
def handle_lit_tag(name, attrs):
    """Return the code for a ``lit-tag`` element.

    Args:
        name: tag name; unused, kept for a signature parallel to handle_clip.
        attrs: attribute dictionary; ``attrs['v']`` is the tag text.

    Returns:
        A one-element list holding a tab-separated ``push`` of the value
        wrapped in angle brackets.

    Raises:
        KeyError: if the ``v`` attribute is missing.
    """
    return ['push\t' + '<' + attrs['v'] + '>']
def handle_clip(name, attrs):
    """Return the code for a ``clip`` element.

    The code pushes the ``pos`` attribute, pushes a regex selected by the
    ``part`` attribute and, unless ``store_mode`` is set, ends with
    ``clipsl`` or ``cliptl`` according to the ``side`` attribute.

    Args:
        name: tag name; unused.
        attrs: attribute dictionary; ``part``, ``pos`` and ``side`` are read.

    Returns:
        List of code statements (strings).

    Raises:
        KeyError: if a read attribute is missing, or ``part`` is neither a
            built-in part nor a key of ``def_attrs``.
    """
    # FIXME: create code for lem, lemh, lemq, whole, tags
    builtin_regex = {
        # NOTE(review): the trailing space in the 'lem' placeholder is kept
        # exactly as it was; confirm whether it is intentional.
        'lem': 'DUMMY REGEX lem ',
        'lemh': 'DUMMY REGEX lemh',
        'lemq': 'DUMMY REGEX lemq',
        'whole': 'DUMMY REGEX whole',
        # this key used to be compared against 'tags ' (trailing space) and
        # therefore never matched
        'tags': 'DUMMY REGEX tags',
    }
    part = attrs['part']
    if part in builtin_regex:
        regex = builtin_regex[part]
    else:
        # FIXME: has to come up with a better version of the regex
        regex = '|'.join(def_attrs[part])

    # push pos, then push regex
    code = ['push\t' + attrs['pos'], 'push\t' + regex]

    # FIXME: a clip that is the target of a 'let' should eventually queue a
    # storesl/storetl in pending_code instead; that was never implemented.
    if not store_mode:
        if attrs['side'] == 'sl':
            code.append(u'clipsl')
        elif attrs['side'] == 'tl':
            code.append(u'cliptl')
    return code
def _pop_deeper_code(depth):
    """Pop all codestack entries deeper than *depth*; return their statements in order."""
    merged = []
    while codestack and codestack[-1][0] > depth:
        # entries come off the stack last-first, so each popped chunk goes in
        # front of what has been collected so far
        merged[:0] = codestack.pop()[2]
    return merged


def _let_store_code(let_children):
    """Return the store statement(s) for a ``let`` given its registered children."""
    if not let_children:
        return []
    container, container_attrs = let_children[0]
    # FIXME: do we need to check where the value is lit or lit-tag?
    if container == 'clip':
        side = container_attrs.get('side')
        if side == 'sl':
            # storesl is most probably not used
            return [u'storesl']
        if side == 'tl':
            return [u'storetl']
    # assigning to a 'var' (or anything else) generates no code yet
    return []


def end_element(name):
    """Handle a closing tag (installed as the expat EndElementHandler).

    For elements that are neither leaf tags nor declaration tags, all code
    pushed by deeper elements is merged, the closing code of this element
    is appended, and the result is pushed back on ``codestack`` at this
    element's depth.  In every case the element's ``children`` entry and
    its ``stack`` entry are removed afterwards.

    Args:
        name: tag name of the element being closed.
    """
    global macro_mode, whenid

    pitem = stack[-1]
    # debug output is restricted to elements inside a def-macro
    debugging = DEBUG_MODE and any(item[0] == 'def-macro' for item in stack)
    if debugging:
        print('DEBUG IN: ELEMENT %s ELEMENT %s' % (name, codestack))

    # if this node is not a leaf as well as not from the declaration section
    if pitem[0] not in leaf_tags and pitem[0] not in dec_tags:
        depth = len(stack)
        code_buff = _pop_deeper_code(depth)

        if name == 'equal':
            # caseless="yes" selects the case-insensitive compare; a missing
            # attribute or any other value selects the plain compare
            if pitem[1].get('caseless') == 'yes':
                code_buff.append(u'cmpi')
            else:
                code_buff.append(u'cmp')
        elif name == 'def-macro':
            label = 'macro_' + pitem[1]['n'] + '_end'
            # end of macro, return
            code_buff.append(label + ': ret')
            labels.append(label)
            macro_mode = False
        elif name == 'when':
            label = u'when' + str(whenid) + u'_end'
            labels.append(label)
            code_buff.append(label + ': nop')
            whenid += 1
        elif name == 'test':
            code_buff.append(u'jnz when' + str(whenid) + '_end')
        elif name == 'let':
            code_buff.extend(_let_store_code(children.get(name, [])))
        # and, or, not, begins-with, ends-with, contains-substring, in and
        # choose generate no closing code yet

        # insert the merged code into the code stack
        codestack.append([depth, name, code_buff])

    if debugging:
        print('DEBUG OUT: ELEMENT %s CODESTACK %s' % (name, codestack))

    # pop the item's children list
    children.pop(name, None)
    # pop the item from the call stack
    stack.pop()
def char_data(data):
    """Handle character data (installed as the expat CharacterDataHandler).

    Text content is ignored by this compiler, so the handler does nothing.

    Args:
        data: the text chunk reported by the parser; unused.
    """
    # print('Character data:', repr(data))
    return None
if __name__ == '__main__':
    # Input file: first command-line argument when given, otherwise the
    # path that used to be hard-coded.
    # other sample input: 'apertium-en-ca.en-ca.t1x'
    path = sys.argv[1] if len(sys.argv) > 1 else 'input-compiler/set3.t1x'

    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    # the with-block closes the file even if reading fails
    with codecs.open(path, 'r', 'utf-8') as f:
        s = f.read()
    # isfinal=True marks the end of input so that a truncated document is
    # reported as an error instead of being silently accepted
    p.Parse(s.encode('utf-8'), True)

    # debugging aids:
    # process_def_attrs()
    # pprint(def_cats); pprint(def_attrs); pprint(def_lists)
    # pprint(codestack); pprint(labels)

    # all generated code ends up merged into the first code stack entry;
    # there is none when the input produced no code at all
    if codestack:
        for code in codestack[0][2]:
            print(code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment