Created
May 24, 2012 03:04
-
-
Save ehaliewicz/2779206 to your computer and use it in GitHub Desktop.
Python FSM parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# process() kind of parses Tadoku entries
# Character Classes
# 0. Space
# 1. Hash
# 2. Semicolon
# 3. Number
# 4. Other
# states
# 0 Start (eats spaces until (number -> 1))
# 1 Number (takes spaces until (#media -> 2))
# 2 Media (takes spaces until (#times -> 3) (; -> 0))
# 3 Times (takes spaces until (; -> 0))
# 4 After semicolon (takes everything but hashes and numbers) until (number -> 1)
# Each collected record: [medium-dec/times-read/number-of/comment, start-pos, end-pos]
# Module-level parser state shared by the helper functions below.
# process() reassigns string, position, state and collected on every call.
string = ""      # text currently being parsed
position = 0     # index of the next character to examine in `string`
state = 0        # index of the current row in the state table
# Scratch record for the token being read: [type, start-pos, end-pos].
cur_collection = ["", 0, 0]
# Records gathered so far, each one [type, start-pos, end-pos].
collected = [["medium", 0, 0]]
def current_char():
    """Return the character of the global ``string`` at the global ``position``.

    Raises:
        IndexError: if ``position`` is out of range for ``string``.
    """
    return string[position]
def next_char():
    """Return the character following the current one, or False if there is none.

    Reads the global ``string`` and ``position`` without modifying them.

    Returns:
        The one-character string at ``position + 1``, or ``False`` when the
        current character is the last one (or ``position`` is already past
        the end).
    """
    following = position + 1
    # Bounds-check the index that is actually read; checking ``position``
    # itself lets the last character through and raises IndexError.
    if following < len(string):
        return string[following]
    return False
def hash_p():
    """Return True when the global ``string`` has a '#' at the global ``position``."""
    return string.startswith("#", position)
def input_class(c):
    """Map a character to its character-class number.

    Args:
        c: the character to classify.

    Returns:
        0 for space, tab, newline or carriage return; 1 for '#'; 2 for ';';
        3 for an ASCII digit '0'-'9'; 4 for anything else.
    """
    if c in (' ', '\t', '\n', '\r'):
        return 0
    if c == '#':
        return 1
    if c == ';':
        return 2
    if '0' <= c <= '9':
        return 3
    return 4
def skip():
    """Advance the global parse ``position`` by one character."""
    global position
    position += 1
def error():
    """Report the character at the current position as misplaced, then skip it.

    Prints the offending character together with the current global
    ``state`` and advances ``position`` by one via skip().
    """
    template = 'Error, skipping misplaced character: {} in state: {}'
    print(template.format(string[position], state))
    skip()
def declaration_p(type_container, prelude_func=True, prelude_lng=0):
    """Check whether a known keyword starts at the current position.

    The candidate token runs from the global ``position`` up to (but not
    including) the next whitespace character or ';', or to the end of the
    global ``string`` when neither occurs.  The first ``prelude_lng``
    characters of the token are dropped before it is looked up.

    Args:
        type_container: collection of accepted keywords (tested with ``in``).
        prelude_func: truth value saying whether the required prelude is
            present; when falsy nothing is matched.
        prelude_lng: number of leading prelude characters to strip from the
            token before the lookup.

    Returns:
        The index just past the token when the keyword is accepted,
        otherwise ``False``.  Callers test the result for truthiness.
    """
    if not prelude_func:
        return False
    # End the token at any character the state table handles on its own:
    # the whitespace set input_class() classifies as Space, or a semicolon.
    # Splitting on ' ' alone rejects "#book;" and "#book\t".
    xpos = position
    while xpos < len(string) and string[xpos] not in " \t\n\r;":
        xpos += 1
    if string[position + prelude_lng:xpos] in type_container:
        return xpos
    return False
def times_p():
    """Test for a '#'-prefixed keyword from ``times`` at the current position.

    Returns whatever declaration_p() returns: the index just past the
    token on a match, otherwise False.
    """
    has_hash = hash_p()
    return declaration_p(times, has_hash, 1)
def media_p():
    """Test for a '#'-prefixed keyword from ``media`` at the current position.

    Returns whatever declaration_p() returns: the index just past the
    token on a match, otherwise False.
    """
    has_hash = hash_p()
    return declaration_p(media, has_hash, 1)
def read_num():
    """Read the number token at the current position.

    The token runs from the global ``position`` to the next whitespace
    character (space, tab, newline or carriage return - the same set
    input_class() classifies as Space) or to the end of ``string``.

    If the token consists only of digits, a ``["number", start, end]``
    record is appended to ``collected`` and ``cur_collection`` is reset to
    its empty value; otherwise a "Skipping malformed number" message is
    printed.  In both cases ``position`` is moved to the end of the token.
    """
    global position, cur_collection
    # Stop at any whitespace, not just ' ', so "12\t#book" is not reported
    # as a malformed number.
    endpos = position
    while endpos < len(string) and string[endpos] not in " \t\n\r":
        endpos += 1
    substring = string[position:endpos]
    if substring.isdigit():
        collected.append(["number", position, endpos])
        cur_collection = ["", 0, 0]
    else:
        print("Skipping malformed number {}".format(substring))
    position = endpos
def read_media():
    """Read a '#media' declaration at the current position.

    On success a ``["media", start, end]`` record (start is the index of
    the '#') is stored in ``cur_collection`` and appended to ``collected``,
    and ``position`` moves to the end of the token.

    On failure an error message is printed and ``position`` is moved
    forward to the next ';', or failing that the next space, or failing
    that the end of ``string``.
    """
    global position, cur_collection
    endpos = media_p()
    # media_p() returns False on failure.  A successful end index is always
    # positive because the token includes the leading '#', so truthiness is
    # a safe test (``False > -1`` is True and must not be used).
    if endpos:
        cur_collection = ["media", position, endpos]
        collected.append(cur_collection)
        position = endpos
        return
    print("Error, malformed media declaration in {}".format(string[position:]))
    # str.find returns -1 (which is truthy) when nothing is found, so
    # compare against -1 explicitly and never store -1 in ``position``.
    resume = string.find(";", position)
    if resume == -1:
        resume = string.find(" ", position)
    if resume == -1:
        resume = len(string)
    position = resume
def read_times():
    """Read a '#times' declaration at the current position.

    On success a ``["times", start, end]`` record (start is the index of
    the '#') is stored in ``cur_collection`` and appended to ``collected``,
    and ``position`` moves to the end of the token.

    On failure an error message is printed and ``position`` is moved to
    the next space, or to the end of ``string`` when there is none.
    """
    global position, cur_collection
    endpos = times_p()
    if endpos:
        cur_collection = ["times", position, endpos]
        collected.append(cur_collection)
        position = endpos
        return
    print("Error, malformed times declaration")
    next_space = string.find(" ", position)
    # str.find returns -1 when no space remains; storing that in
    # ``position`` would make the parser index from the end of the string
    # and possibly never terminate.
    position = next_space if next_space != -1 else len(string)
# Keywords accepted by times_p() after a leading '#'.
times = [
    "first",
    "second",
    "third",
    "fourth",
    "fifth",
]
# Keywords accepted by media_p() after a leading '#'.
media = [
    'book',
    'dr',
    'manga',
    'fullgame',
    'game',
    'lyric',
    'subs',
    'news',
    'nico',
    'sentences',
]
# Human-readable state names; not referenced by the code shown in this file.
states = [
    "Start",
    "Number",
    "Media",
    "Times",
    "Semicolon",
    "End",
]
# State table: one row per state, one (handler, next-state) pair per
# character class.  process() reads the handler from column 2 * class and
# the next state from column 2 * class + 1.
#
#    Space       Hash            Semicolon   Number        Other
sm = [
    [skip, 0,    error, 0,       error, 0,   read_num, 1,  error, 0],  # Start
    [skip, 1,    read_media, 2,  error, 1,   error, 1,     error, 1],  # Number
    [skip, 2,    read_times, 3,  skip, 0,    error, 2,     error, 2],  # Media
    [skip, 3,    error, 3,       skip, 0,    error, 3,     error, 3],  # Times
    [skip, 4,    skip, 4,        skip, 4,    read_num, 1,  skip, 4],   # After semicolon
]
def process(incoming):
    """Run the state machine over ``incoming`` and return the collected records.

    Resets the global ``string``, ``position``, ``state`` and ``collected``,
    then repeatedly classifies the current character, calls the handler the
    state table ``sm`` gives for (state, class), and moves to the next state
    from the same table entry.  A trace line is printed for every step.

    Args:
        incoming: the text to parse.

    Returns:
        The global ``collected`` list of ``[type, start-pos, end-pos]``
        records appended by the handlers.
    """
    global position, string, collected, state
    string = incoming
    position = 0
    state = 0
    collected = []
    while position < len(incoming):
        char_class = input_class(current_char())
        column = 2 * char_class
        statefunc = sm[state][column]
        trace = "Character: {}, Class: {}, State: {}, State Function: {}"
        print(trace.format(current_char(), char_class, state, statefunc))
        statefunc()
        state = sm[state][column + 1]
        # States above 3 end the run early.
        if state > 3:
            break
    return collected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment