Skip to content

Instantly share code, notes, and snippets.

@erikbgithub
Created August 27, 2010 14:41
Show Gist options
  • Save erikbgithub/553480 to your computer and use it in GitHub Desktop.
import warnings
# States of the character-level state machine in parse_gramar()
STATE_CLEAR = 1    # reading plain (clear) text
STATE_VAR = 2      # inside a [VARIABLE] name
STATE_SPECIAL = 3  # inside a #SPECIAL# section
# Characters permitted inside a variable name (upper-case ASCII only)
VAR_ALLOWED_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# special chars make it more usable for non programmers, who probably never heard about \n and so on
# more can and will be added later on
# maps a #NAME# marker body to the literal character it stands for
special_char_map = {'NEWLINE' : '\n'}
def is_substring(full, sub):
    '''Return True if ``full`` starts with ``sub``.

    NOTE: despite the name this is a *prefix* test, not a general
    substring test — the original loop compared ``full[i]`` against
    ``sub[i]`` from index 0 only. Callers (parse_input) rely on the
    prefix semantics, so the behavior is kept and only the hand-rolled
    loop is replaced by the equivalent str.startswith().
    '''
    return full.startswith(sub)
def make_token(type, text, children=None):
    '''Build a single node of the AST as a plain dict.

    type     -- node kind, e.g. 'Variable', 'ClearText', 'Gramar'
    text     -- the raw text this node represents
    children -- optional list of child nodes; a fresh empty list is
                created when omitted (avoids the shared-default pitfall)
    '''
    return {
        'type': type,
        'text': text,
        'children': [] if children is None else children,
    }
def parse_gramar(text, start_var='[', end_var=']', special_marker='#'):
    '''Parse a gramar string and return it as a token tree.

    Example gramar (apostrophes are not part of the string):
        '[HANZI],[PINYIN],[ENGL]#NEWLINE#'
    This example gramar is able to read a CSV line for a chinese vocable.
    * between '[' and ']' is a variable name
    * '#' encloses an area of unparsed clear text or a special character
      name from special_char_map (e.g. NEWLINE)
    * all other characters are clear text used to locate the variables

    Returns a 'Gramar' token whose children are 'Variable' and
    'ClearText' tokens. Raises Exception when the string ends inside an
    unclosed variable or special section.

    Fixes vs. the original:
    * string comparisons use == instead of the identity operator `is`
      (identity of equal strings is implementation-dependent)
    * buffer_special is reset after each closed #...# section, so a
      second special section no longer concatenates with the first
    * the unknown-state error message no longer raises TypeError
      (it concatenated an int into a str)
    * consecutive Variable tokens are filtered by building a new list
      instead of calling tokens.remove() while iterating tokens
    '''
    tokens = []
    state = STATE_CLEAR
    buffer_clear = ''
    buffer_var = ''
    buffer_special = ''
    for char in text:
        if state == STATE_CLEAR:
            # handle clear text
            if char == start_var:
                if len(buffer_clear) > 0:
                    tokens.append(make_token('ClearText', buffer_clear))
                buffer_clear = ''
                state = STATE_VAR
            elif char == special_marker:
                state = STATE_SPECIAL
            else:
                buffer_clear += char
        elif state == STATE_VAR:
            # handle variable names
            if char in VAR_ALLOWED_CHARS:
                buffer_var += char
            elif char == end_var and len(buffer_var) > 0:
                tokens.append(make_token('Variable', buffer_var))
                buffer_var = ''
                state = STATE_CLEAR
            else:
                warnings.warn('strange character found "%s". will be ignored' % (char,), SyntaxWarning)
        elif state == STATE_SPECIAL:
            # handle special characters and the ignore-parser case
            if char == special_marker:
                if len(buffer_special) > 0:
                    # buffer_special either names a special character or is
                    # a string that should be printed without further
                    # formatting; only the first case is in the mapping
                    buffer_clear += special_char_map.get(buffer_special, buffer_special)
                    buffer_special = ''  # bugfix: reset for the next section
                else:
                    warnings.warn('there are empty markers for a special character.', SyntaxWarning)
                state = STATE_CLEAR
            else:
                buffer_special += char
        else:
            # bugfix: the original did 'state' + state, a TypeError
            raise Exception('state %r not recognised' % (state,))
    if state != STATE_CLEAR:
        raise Exception('read string is incomplete or has errors. please make sure that all opened Variable definition, special text areas and so on are correctly closed!')
    if len(buffer_clear) > 0:
        # cleartext at the end is not flushed by the loop
        tokens.append(make_token('ClearText', buffer_clear))
    # clean up variables that are not separated by clear text; build a new
    # list instead of removing from the list being iterated (which skips
    # elements)
    cleaned = []
    last_was_var = False
    for t in tokens:
        if t['type'] == 'Variable':
            if last_was_var:
                warnings.warn('2 variables must be seperated by at least a komma or white space or something.')
                continue
            last_was_var = True
        else:
            last_was_var = False
        cleaned.append(t)
    return make_token('Gramar', text, cleaned)
def parse_input(input, gramar_token):
    '''Match an input string against a token tree from parse_gramar().

    Repeatedly applies the gramar's child tokens to the input, cutting
    out the text that stands where each Variable token is and grouping
    the values of one pass into a 'TextBlock' token.

    input        -- the text to parse (e.g. the lines of a CSV file)
    gramar_token -- a 'Gramar' token as returned by parse_gramar()
    Returns an 'InputText' token whose children are 'TextBlock' tokens;
    each TextBlock's children are tokens named after the variables.
    Raises Exception when the input does not fit the gramar.

    Fixes vs. the original:
    * both "expecting ..." error paths concatenated the token *dict*
      into the message (TypeError) and sliced the input with the wrong
      end index; they now show the token text and the matching slice
    * str.find() returning -1 used to corrupt the indices (and could
      loop forever); it now fails with a clear error
    * string comparisons use == instead of the identity operator `is`
    '''
    STATE_NONE = 1   # the start state
    STATE_VAR = 2    # handling a variable token
    STATE_CLEAR = 3  # handling a cleartext token
    state = STATE_NONE
    buffer_name = ''
    idx_input = 0
    idx_old = 0
    txt_token = make_token('InputText', input)
    # hack: guarantee a trailing newline when the gramar expects one,
    # otherwise the last block of the input would never terminate
    if (gramar_token['children'][-1]['text'] == '\n') and (not input.endswith('\n')):
        input += '\n'
    while True:
        blocks = []
        for token in gramar_token['children']:
            if state == STATE_NONE:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    if input[idx_input:].startswith(token['text']):
                        idx_input += len(token['text'])
                        state = STATE_CLEAR
                    else:
                        raise Exception('expecting "%s" but got "%s"'
                                        % (token['text'], input[idx_input:idx_input + len(token['text'])]))
                else:
                    warnings.warn('found a gramar token that I can not handle (will just be ignored): ' + token['type'])
            elif state == STATE_VAR:
                if token['type'] == 'Variable':
                    warnings.warn('found to handle 2 variables. I will ignore the second one: ' + token['text'])
                elif token['type'] == 'ClearText':
                    # the variable's value is everything up to the next
                    # occurrence of this cleartext delimiter
                    idx_clear = input[idx_input:].find(token['text'])
                    if idx_clear < 0:
                        raise Exception('expecting "%s" after variable "%s" but it was not found'
                                        % (token['text'], buffer_name))
                    used_idx = idx_input + idx_clear
                    blocks.append(make_token(buffer_name, input[idx_input:used_idx]))
                    idx_input = used_idx + len(token['text'])
                    state = STATE_CLEAR
                else:
                    warnings.warn('found a gramar token that I can not handle (will just be ignored): ' + token['type'])
            elif state == STATE_CLEAR:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    warnings.warn('found a second ClearText token. I will handle it normally. Please check if Gramar and Results are fine!')
                    if is_substring(input[idx_input:], token['text']):
                        idx_input += len(token['text'])
                    else:
                        raise Exception('expecting "%s" but got "%s"'
                                        % (token['text'], input[idx_input:idx_input + len(token['text'])]))
                else:
                    warnings.warn('found a gramar token that I can not handle (will just be ignored): ' + token['type'])
        txt_token['children'].append(make_token('TextBlock', input[idx_old:idx_input], blocks))
        idx_old = idx_input
        if idx_input >= len(input):
            break
    return txt_token
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment