# Gist erikbgithub/553480, created August 27, 2010 14:41
import warnings

STATE_CLEAR = 1
STATE_VAR = 2
STATE_SPECIAL = 3

VAR_ALLOWED_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# special characters make the format more usable for non-programmers,
# who have probably never heard of \n and so on.
# more can and will be added later on
special_char_map = {'NEWLINE': '\n'}

def is_substring(full, sub):
    '''despite the name, this checks whether sub is a *prefix* of full'''
    for i in range(len(sub)):
        if (i >= len(full)) or (full[i] != sub[i]):
            return False
    return True

def make_token(token_type, text, children=None):
    '''creates an element of the AST'''
    if children is None:
        children = []
    return {'type': token_type, 'text': text, 'children': children}

def parse_gramar(text, start_var='[', end_var=']', special_marker='#'):
    '''parses a grammar string into a token tree of clear text and variables

    example grammar (the apostrophes are not part of the string):
        '[HANZI],[PINYIN],[ENGL]#NEWLINE#'
    this example grammar is able to read a CSV line for a Chinese vocabulary entry.
    * between '[' and ']' is a variable name
    * '#' encloses an area of unparsed clear text or a special character
    * all other characters are clear text and are only used to determine
      where the values of the variables start and end
    '''
    tokens = []
    state = STATE_CLEAR
    buffer_clear = ''
    buffer_var = ''
    buffer_special = ''
    for char in text:
        if state == STATE_CLEAR:
            # handle clear text
            if char == start_var:
                if len(buffer_clear) > 0:
                    tokens.append(make_token('ClearText', buffer_clear))
                buffer_clear = ''
                state = STATE_VAR
            elif char == special_marker:
                state = STATE_SPECIAL
            else:
                buffer_clear += char
                state = STATE_CLEAR
        elif state == STATE_VAR:
            # handle variable names
            if char in VAR_ALLOWED_CHARS:
                buffer_var += char
                state = STATE_VAR
            elif char == end_var and len(buffer_var) > 0:
                tokens.append(make_token('Variable', buffer_var))
                buffer_var = ''
                state = STATE_CLEAR
            else:
                warnings.warn('strange character "%s" found. it will be ignored' % (char,), SyntaxWarning)
                state = STATE_VAR
        elif state == STATE_SPECIAL:
            # handle special characters and the unparsed-text case
            if char == special_marker:
                if len(buffer_special) > 0:
                    # the following line differentiates 2 cases:
                    # buffer_special contains either the name of a special
                    # character, or a string that should be printed without
                    # further formatting. in the second case the string is
                    # not in the mapping of special characters
                    buffer_clear += special_char_map[buffer_special] if (buffer_special in special_char_map) else buffer_special
                    buffer_special = ''  # reset, or the next special section would inherit this one's content
                else:
                    warnings.warn('there are empty markers for a special character.', SyntaxWarning)
                state = STATE_CLEAR
            else:
                buffer_special += char
                state = STATE_SPECIAL
        else:
            raise Exception('state %r not recognised' % (state,))
    if state != STATE_CLEAR:
        raise Exception('the parsed string is incomplete or has errors. please make sure that all opened variable definitions, special text areas and so on are correctly closed!')
    if len(buffer_clear) > 0:
        # clear text at the very end is not handled by the loop
        tokens.append(make_token('ClearText', buffer_clear))
    # clean up variables that are not separated by clear text
    last_was_var = False
    for t in list(tokens):  # iterate over a copy so removing from tokens is safe
        if t['type'] == 'Variable':
            if last_was_var:
                warnings.warn('2 variables must be separated by at least a comma or white space or something.', SyntaxWarning)
                tokens.remove(t)
            else:
                last_was_var = True
        else:
            last_was_var = False
    return make_token('Gramar', text, tokens)
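
# Illustration (added here, not part of the original gist): for the docstring
# example '[HANZI],[PINYIN],[ENGL]#NEWLINE#', parse_gramar should return a
# token tree shaped like this:
#
#   {'type': 'Gramar', 'text': '[HANZI],[PINYIN],[ENGL]#NEWLINE#', 'children': [
#       {'type': 'Variable',  'text': 'HANZI',  'children': []},
#       {'type': 'ClearText', 'text': ',',      'children': []},
#       {'type': 'Variable',  'text': 'PINYIN', 'children': []},
#       {'type': 'ClearText', 'text': ',',      'children': []},
#       {'type': 'Variable',  'text': 'ENGL',   'children': []},
#       {'type': 'ClearText', 'text': '\n',     'children': []}]}
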
def parse_input(input, gramar_token):
    STATE_NONE = 1   # the start state
    STATE_VAR = 2    # handling a variable token
    STATE_CLEAR = 3  # handling a cleartext token
    state = STATE_NONE
    buffer_name = ''
    idx_input = 0
    idx_old = 0
    txt_token = make_token('InputText', input)
    # hack time - make sure there is a trailing newline
    if (gramar_token['children'][-1]['text'] == '\n') and (not input.endswith('\n')):
        input += '\n'
    while True:
        blocks = []
        for token in gramar_token['children']:
            #print('handle', token)
            if state == STATE_NONE:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    if input[idx_input:].startswith(token['text']):
                        idx_input += len(token['text'])
                        state = STATE_CLEAR
                    else:
                        raise Exception('expecting "' + token['text'] + '" but got "' + input[idx_input:idx_input + len(token['text'])] + '"')
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_NONE
            elif state == STATE_VAR:
                if token['type'] == 'Variable':
                    warnings.warn('found 2 variables in a row. I will ignore the second one: ' + token['text'])
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    idx_clear = input[idx_input:].find(token['text'])
                    if idx_clear < 0:
                        raise Exception('expecting "' + token['text'] + '" somewhere after position %d but did not find it' % idx_input)
                    used_idx = idx_input + idx_clear
                    blocks.append(make_token(buffer_name, input[idx_input:used_idx]))
                    #print({'idx_input': idx_input, 'idx_clear': idx_clear, 'inputpart': input[idx_input:used_idx], 'len(text)': len(input[idx_input:used_idx]), 'len(tokentxt)': len(token['text'])})
                    idx_input += idx_clear + len(token['text'])
                    state = STATE_CLEAR
                    #if (token['text'] == '\n') and (input[idx_input:used_idx] == ''):
                    #    break
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_VAR
            elif state == STATE_CLEAR:
                if token['type'] == 'Variable':
                    buffer_name = token['text']
                    state = STATE_VAR
                elif token['type'] == 'ClearText':
                    warnings.warn('found a second ClearText token. I will handle it normally. please check whether grammar and results are fine!')
                    if is_substring(input[idx_input:], token['text']):
                        idx_input += len(token['text'])
                        state = STATE_CLEAR
                    else:
                        raise Exception('expecting "' + token['text'] + '" but got "' + input[idx_input:idx_input + len(token['text'])] + '"')
                else:
                    warnings.warn('found a grammar token that I cannot handle (it will just be ignored): ' + token['type'])
                    state = STATE_CLEAR
            #print('--after', (state, buffer_name, idx_input, len(token['text']), input[idx_input:], token, blocks))
            #print("")
        txt_token['children'].append(make_token('TextBlock', input[idx_old:idx_input], blocks))
        idx_old = idx_input
        if idx_input >= len(input):
            break
    return txt_token
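
# A minimal usage sketch (added for illustration, not part of the original
# gist): parse two CSV lines of Chinese vocabulary with the grammar from the
# parse_gramar docstring. The sample input values are made up.
if __name__ == '__main__':
    gramar = parse_gramar('[HANZI],[PINYIN],[ENGL]#NEWLINE#')
    parsed = parse_input('ni3hao3,ni hao,hello\nxie4xie4,xie xie,thanks\n', gramar)
    for line in parsed['children']:
        for var in line['children']:
            print('%s = %s' % (var['type'], var['text']))
    # expected output:
    # HANZI = ni3hao3
    # PINYIN = ni hao
    # ENGL = hello
    # HANZI = xie4xie4
    # PINYIN = xie xie
    # ENGL = thanks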