Created
June 9, 2015 04:49
-
-
Save hghwng/dd42653e04028861ebb7 to your computer and use it in GitHub Desktop.
FSA for Speech and Language Processing 2, 2-3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python3 | |
g_dbg = False | |
g_rule = {} | |
def print_state(state, text): | |
print('\t[', end='') | |
for i in state['pre']: | |
print('{} -> '.format(i), end='') | |
print('{}] '.format(state['tag']), end='') | |
for i in range(state['index'], len(text)): | |
print(text[i] + ' ', end='') | |
print('') | |
def search(text, rules): | |
global g_dbg | |
text = text.split(' ') | |
states = [] | |
if g_dbg: | |
states.append({'index': 0, 'tag': 'start', 'pre': []}) | |
else: | |
states.append({'index': 0, 'tag': 'start'}) | |
while len(states) > 0: | |
state = states.pop() | |
# test whether the destination state is found | |
if state['tag'] == 'end': | |
if state['index'] == len(text): | |
# only succeeds on full match | |
return True | |
else: | |
continue | |
if state['index'] < 0 and state['index'] >= len(text): | |
continue | |
# output debug message on demand | |
if g_dbg: | |
print_state(state, text) | |
# get transitions of current state | |
if state['tag'] not in rules: | |
continue | |
else: | |
rule = rules[state['tag']] | |
to_match = text[state['index']] | |
for transition in rule: | |
new_state = {'index': -1, 'tag': transition[1]} | |
if g_dbg: | |
new_state['pre'] = list(state['pre']) | |
new_state['pre'].append(state['tag']) | |
for candedate in transition[0]: | |
if candedate == '': | |
# epsilon transition | |
new_state['index'] = state['index'] | |
elif candedate == to_match: | |
new_state['index'] = state['index'] + 1 | |
if new_state['index'] != -1 and new_state not in states: | |
states.insert(0, new_state) | |
return False | |
def build_rule(): | |
one_plus = ['one', 'two', 'three', 'four', 'five', | |
'six', 'seven', 'eight', 'nine'] | |
two_plus = ['two', 'three', 'four', 'five', | |
'six', 'seven', 'eight', 'nine'] | |
ten_plus = ['eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', | |
'sixteen', 'seventeen', 'eighteen', 'nineteen'] | |
xx_ty = ['ten', 'twenty', 'thirty', 'forty', 'fifty', | |
'sixty', 'seventy', 'eighty', 'ninety'] | |
no_suffix = [] | |
no_suffix.extend(one_plus) | |
no_suffix.extend(ten_plus) | |
has_suffix = xx_ty | |
''' | |
abbr: | |
c : cent | |
d : dollar | |
sg : singular | |
sg? : singular or plural | |
pl : plural | |
hs : has suffix | |
ns : no suffix | |
stg : stage | |
number less than 100: | |
$no_suffix | ($has_suffix? $one_plus) | |
''' | |
rules = {} | |
rules['start'] = [ | |
[[''], 'd stg0'], # one thousand ... | |
[[''], 'd stg1'], # five hundred ... | |
[[''], 'd stg2 sg?'], # fifty one / one | |
[[''], 'c'], # fifty one / one | |
] | |
# | |
# stage 0: number for thousand | |
# | |
rules['d stg0'] = [ | |
[no_suffix, 'd stg0 ns'], # eleven thousand ... | |
[has_suffix, 'd stg0 hs'], # fifty thousand ... | |
] | |
rules['d stg0 hs'] = [ | |
[[''], 'd stg0 ns'], # fifty thousand ... | |
[one_plus, 'd stg0 ns'], # fifty one thousand ... | |
] | |
rules['d stg0 ns'] = [ | |
[['thousand'], 'd stg2 pl'], # one thousand eighty ... | |
[['thousand'], 'd stg1'], # one thousand five hundred ... | |
] | |
# | |
# stage 1: number for hundred | |
# | |
rules['d stg1'] = [ | |
[one_plus, 'd stg1 ns'], # one hundred ... | |
] | |
rules['d stg1 ns'] = [ | |
[['hundred'], 'd stg2 pl'], # one hundred ... | |
] | |
# | |
# stage 2 (plural): number less than 100 (for confirmed plural) | |
# | |
rules['d stg2 pl'] = [ | |
[[''], 'd pl'], # one hundred | |
[no_suffix, 'd pl'], # one hundred eleven | |
[has_suffix, 'd stg2 hs'], # one hundred fifty ... | |
] | |
rules['d stg2 hs'] = [ | |
[[''], 'd pl'], # one hundred fifty | |
[one_plus, 'd pl'], # one hundred fifty one | |
] | |
# | |
# stage 2 (plural or singural): number less than 100 (detect form) | |
# | |
rules['d stg2 sg?'] = [ | |
[['one'], 'd sg'], # one | |
[two_plus, 'd pl'], # five | |
[ten_plus, 'd pl'], # eleven | |
[has_suffix, 'd stg2 hs'], # fifty ... | |
] | |
# | |
# accept 'doller' | |
# | |
rules['d pl'] = [ | |
[['dollars'], 'end'], | |
[['dollars'], 'c'], | |
] | |
rules['d sg'] = [ | |
[['dollar'], 'end'], | |
[['dollar'], 'c'] | |
] | |
# | |
# recognize conts | |
# | |
rules['c'] = [ | |
[[''], 'end'], # probably no cent included | |
[['one'], 'c sg'], | |
[two_plus, 'c pl'], | |
[ten_plus, 'c pl'], | |
[has_suffix, 'c hs'], | |
] | |
rules['c hs'] = [ | |
[[''], 'c pl'], | |
[one_plus, 'c pl'], | |
] | |
rules['c pl'] = [ | |
[['cents'], 'end'], | |
] | |
rules['c sg'] = [ | |
[['cent'], 'end'], | |
] | |
return rules | |
def test_one(answer, text): | |
global g_rule | |
print('Testing "{}": '.format(text), end='') | |
if search(text, g_rule) == answer: | |
print('pass') | |
else: | |
print('fail. {} expected'.format(answer)) | |
def test_all(): | |
# 11, 12, ... | |
test_one(True, 'eleven dollars') | |
test_one(True, 'nineteen dollars') | |
# 20, 30, ... | |
test_one(True, 'fifty dollars') | |
test_one(True, 'seventy dollars') | |
test_one(False, 'fifty nineteen dollars') | |
test_one(False, 'nineteen fifty dollars') | |
test_one(True, 'fifty six dollars') | |
# 100 - 999 | |
test_one(True, 'one hundred dollars') | |
test_one(True, 'two hundred six dollars') | |
test_one(True, 'nine hundred ninety six dollars') | |
# 1,000 - 100,000 | |
test_one(True, 'twenty thousand sixty six dollars') | |
test_one(True, 'sixty one thousand five hundred forty six dollars') | |
test_one(False, 'thousand five hundred forty six dollars one cent') | |
# With 'cent' | |
test_one(True, 'one thousand five hundred forty six dollars ' | |
'twenty one cents') | |
test_one(True, 'one thousand five hundred forty six dollars one cent') | |
test_one(False, 'one dollars one cent') | |
test_one(False, 'one dollar one cents') | |
test_one(False, 'one dollar one thousand cents') | |
# Plural vs singular | |
test_one(True, 'one dollar') | |
test_one(False, 'one dollars') | |
test_one(True, 'six cents') | |
test_one(False, 'six cent') | |
def main(): | |
global g_rule | |
g_rule = build_rule() | |
global g_dbg | |
if False: | |
g_dbg = True | |
test_one(False, 'one dollar one cents') | |
else: | |
test_all() | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment