# Gist erikbgithub/553480, created August 27, 2010 14:41
import warnings

STATE_CLEAR = 1
STATE_VAR = 2
STATE_SPECIAL = 3

VAR_ALLOWED_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# special characters make the format more usable for non-programmers,
# who have probably never heard of \n and so on.
# more can and will be added later on
special_char_map = {'NEWLINE': '\n'}

def is_substring(full, sub):
    '''despite the name, this checks whether sub is a *prefix* of full'''
    for i in range(len(sub)):
        if (i >= len(full)) or (full[i] != sub[i]):
            return False
    return True

def make_token(token_type, text, children=None):
    '''creates an element of the AST'''
    if children is None:
        children = []
    return {'type': token_type, 'text': text, 'children': children}

def parse_gramar(text, start_var='[', end_var=']', special_marker='#'):
    '''parses a grammar string into a token tree of clear text and variables

    example grammar (the apostrophes are not part of the string):
        '[HANZI],[PINYIN],[ENGL]#NEWLINE#'
    this example grammar is able to read a CSV line for a Chinese vocabulary entry.
    * between '[' and ']' is a variable name
    * '#' encloses an area of unparsed clear text or a special character
    * all other characters are clear text and are only used to determine
      where the values of the variables start and end
    '''
    tokens = []
    state = STATE_CLEAR
    buffer_clear = ''
    buffer_var = ''
    buffer_special = ''
    for char in text:
        if state == STATE_CLEAR:
            # handle clear text
            if char == start_var:
                if len(buffer_clear) > 0:
                    tokens.append(make_token('ClearText', buffer_clear))
                buffer_clear = ''
                state = STATE_VAR
            elif char == special_marker:
                state = STATE_SPECIAL
            else:
                buffer_clear += char
                state = STATE_CLEAR
        elif state == STATE_VAR:
            # handle variable names
            if char in VAR_ALLOWED_CHARS:
                buffer_var += char
                state = STATE_VAR
            elif char == end_var and len(buffer_var) > 0:
                tokens.append(make_token('Variable', buffer_var))
                buffer_var = ''
                state = STATE_CLEAR
            else:
                warnings.warn('strange character "%s" found. it will be ignored' % (char,), SyntaxWarning)
                state = STATE_VAR
        elif state == STATE_SPECIAL:
            # handle special characters and the unparsed-text case
            if char == special_marker:
                if len(buffer_special) > 0:
                    # the following line differentiates 2 cases:
                    # buffer_special contains either the name of a special
                    # character, or a string that should be printed without
                    # further formatting. in the second case the string is
                    # not in the mapping of special characters
                    buffer_clear += special_char_map[buffer_special] if (buffer_special in special_char_map) else buffer_special
                    buffer_special = ''  # reset, or the next special section would inherit this one's content
                else:
                    warnings.warn('there are empty markers for a special character.', SyntaxWarning)
                state = STATE_CLEAR
            else:
                buffer_special += char
                state = STATE_SPECIAL
        else:
            raise Exception('state %r not recognised' % (state,))
    if state != STATE_CLEAR:
        raise Exception('the parsed string is incomplete or has errors. please make sure that all opened variable definitions, special text areas and so on are correctly closed!')
    if len(buffer_clear) > 0:
        # clear text at the very end is not handled by the loop
        tokens.append(make_token('ClearText', buffer_clear))
    # clean up variables that are not separated by clear text
    last_was_var = False
    for t in list(tokens):  # iterate over a copy so removing from tokens is safe
        if t['type'] == 'Variable':
            if last_was_var:
                warnings.warn('2 variables must be separated by at least a comma or white space or something.', SyntaxWarning)
                tokens.remove(t)
            else:
                last_was_var = True
        else:
            last_was_var = False
    return make_token('Gramar', text, tokens)
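
# Illustration (added here, not part of the original gist): for the docstring
# example '[HANZI],[PINYIN],[ENGL]#NEWLINE#', parse_gramar should return a
# token tree shaped like this:
#
#   {'type': 'Gramar', 'text': '[HANZI],[PINYIN],[ENGL]#NEWLINE#', 'children': [
#       {'type': 'Variable',  'text': 'HANZI',  'children': []},
#       {'type': 'ClearText', 'text': ',',      'children': []},
#       {'type': 'Variable',  'text': 'PINYIN', 'children': []},
#       {'type': 'ClearText', 'text': ',',      'children': []},
#       {'type': 'Variable',  'text': 'ENGL',   'children': []},
#       {'type': 'ClearText', 'text': '\n',     'children': []}]}
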
def parse_input(input, gramar_token):
    STATE_NONE = 1   # the start state
    STATE_VAR = 2    # handling a variable token
    STATE_CLEAR = 3  # handling a cleartext token
    state = STATE_NONE
    buffer_name = ''
    idx_input = 0
    idx_old = 0
    txt_token = make_token('InputText', input)
    # hack time - make sure there is a trailing newline
    if (gramar_token['children'][-1]['text'] == '\n') and (not input.endswith('\n')):
        input += '\n'
    while True:
        blocks = []
        for token in gramar_token['children']:
            #print('handle', token)
            if state == STATE_NONE:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    if input[idx_input:].startswith(token['text']):
                        idx_input += len(token['text'])
                        state = STATE_CLEAR
                    else:
                        raise Exception('expecting "' + token['text'] + '" but got "' + input[idx_input:idx_input + len(token['text'])] + '"')
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_NONE
            elif state == STATE_VAR:
                if token['type'] == 'Variable':
                    warnings.warn('found 2 variables in a row. I will ignore the second one: ' + token['text'])
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    idx_clear = input[idx_input:].find(token['text'])
                    if idx_clear < 0:
                        raise Exception('expecting "' + token['text'] + '" somewhere after position %d but did not find it' % idx_input)
                    used_idx = idx_input + idx_clear
                    blocks.append(make_token(buffer_name, input[idx_input:used_idx]))
                    #print({'idx_input': idx_input, 'idx_clear': idx_clear, 'inputpart': input[idx_input:used_idx], 'len(text)': len(input[idx_input:used_idx]), 'len(tokentxt)': len(token['text'])})
                    idx_input += idx_clear + len(token['text'])
                    state = STATE_CLEAR
                    #if (token['text'] == '\n') and (input[idx_input:used_idx] == ''):
                    #    break
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_VAR
            elif state == STATE_CLEAR:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    warnings.warn('found a second ClearText token. I will handle it normally. please check whether grammar and results are fine!')
                    if is_substring(input[idx_input:], token['text']):
                        idx_input += len(token['text'])
                        state = STATE_CLEAR
                    else:
                        raise Exception('expecting "' + token['text'] + '" but got "' + input[idx_input:idx_input + len(token['text'])] + '"')
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_CLEAR
            #print('--after', (state, buffer_name, idx_input, len(token['text']), input[idx_input:], token, blocks))
            #print("")
        txt_token['children'].append(make_token('TextBlock', input[idx_old:idx_input], blocks))
        idx_old = idx_input
        if idx_input >= len(input):
            break
    return txt_token
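
# A minimal usage sketch (added for illustration, not part of the original
# gist): parse two CSV lines of Chinese vocabulary with the grammar from the
# parse_gramar docstring. The sample input values are made up.
if __name__ == '__main__':
    gramar = parse_gramar('[HANZI],[PINYIN],[ENGL]#NEWLINE#')
    parsed = parse_input('ni3hao3,ni hao,hello\nxie4xie4,xie xie,thanks\n', gramar)
    for line in parsed['children']:
        for var in line['children']:
            print('%s = %s' % (var['type'], var['text']))
    # expected output:
    # HANZI = ni3hao3
    # PINYIN = ni hao
    # ENGL = hello
    # HANZI = xie4xie4
    # PINYIN = xie xie
    # ENGL = thanks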