Created
June 11, 2017 01:11
-
-
Save glenrobertson/0ad29536ac050d86d739214541c4701b to your computer and use it in GitHub Desktop.
Convert transcript to list of digits
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
# Spoken number words from "zero" to "nineteen"; each word's value is its
# position in the tuple below.
words_to_digits = {
    word: value
    for value, word in enumerate((
        'zero', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ))
}
# Homonyms: words that sound like a number word and map to the same value.
words_to_digits.update([
    ('for', 4),
    ('oh', 0),
    ('too', 2),
    ('to', 2),
])
# Tens words, paired in order with the multiples of ten from 20 to 90.
prefixes = dict(zip(
    (
        'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    ),
    range(20, 100, 10),
))
# Magnitude words and the multiplier each one stands for.
suffixes = dict(hundred=100, thousand=1000)
# Repetition words and the repeat count each one stands for.
repeaters = dict([('double', 2), ('triple', 3)])
def extract_digits(transcript):
    """
    Extract the numbers spoken in a transcript, in spoken order.

    :param transcript str: a text transcript; it is split on whitespace and
        each word is matched exactly against the module's lookup tables
        (words_to_digits, prefixes, suffixes, repeaters)
    :return: a list of ints; empty when the transcript is empty or None, or
        when no word is recognised

    One example usage is a transcript of someone speaking a phone number:
    They may say:
    1) "four one five eight double two five thousand"
        Result: [4, 1, 5, 8, 2, 2, 5000]
    or
    2) "triple five eight for too two two thirty"
        Result: [5, 5, 5, 8, 4, 2, 2, 2, 30]

    Words are mapped to digits.
    Prefixes are held back and added to the next occurring unit digit (1-9);
    if anything else follows, the prefix is emitted on its own first.
    Suffixes are multiplied against the most recent number, which is a
    held-back prefix when there is one.
    Repeaters are stored to repeat the next occurring number.
    Words found in none of the tables are skipped.
    """
    digits = []
    if not transcript:
        return digits

    pending_repeat = None
    pending_tens = None

    # split() with no argument treats any run of whitespace (spaces, tabs,
    # newlines) as a single separator and never yields empty strings.
    for word in transcript.split():
        if word in repeaters:
            # Emit a held-back prefix now so the output keeps spoken order
            # ("twenty double five" -> 20, 5, 5).
            if pending_tens is not None:
                digits.append(pending_tens)
                pending_tens = None
            pending_repeat = repeaters[word]
        elif word in suffixes:
            # The suffix belongs to the number just spoken, which is the
            # held-back prefix when there is one ("twenty thousand" -> 20000).
            if pending_tens is not None:
                digits.append(pending_tens)
                pending_tens = None
            if digits:
                digits[-1] *= suffixes[word]
        elif word in prefixes:
            # Two prefixes in a row: the earlier one stands on its own.
            if pending_tens is not None:
                digits.append(pending_tens)
            pending_tens = prefixes[word]
        elif word in words_to_digits:
            value = words_to_digits[word]
            if pending_tens is not None:
                if 1 <= value <= 9:
                    # "twenty eight" -> 28
                    value += pending_tens
                else:
                    # Zero and ten..nineteen cannot complete a tens word, so
                    # the prefix is a number of its own ("twenty oh" -> 20, 0).
                    digits.append(pending_tens)
                pending_tens = None
            count = 1 if pending_repeat is None else pending_repeat
            digits.extend([value] * count)
            pending_repeat = None

    # A prefix at the very end of the transcript stands on its own.
    if pending_tens is not None:
        digits.append(pending_tens)
    return digits
""" | |
Tests | |
""" | |
# Fixtures: (transcript, expected extract_digits result) pairs.
# A repeater expands into separate list entries ("double two" -> 2, 2), the
# same way the first case expects "double eight" -> 8, 8.
transcript_digits = [
    (
        'nine oh double eight three four four twenty eight twenty two five thousand two',
        [9, 0, 8, 8, 3, 4, 4, 28, 22, 5000, 2],
    ),
    (
        'four one five eight double two five thousand',
        [4, 1, 5, 8, 2, 2, 5000],
    ),
    (
        'triple five eight for too two two thirty',
        [5, 5, 5, 8, 4, 2, 2, 2, 30],
    ),
    (
        'four one five um eight one uh too for six hundred',
        [4, 1, 5, 8, 1, 2, 4, 600],
    ),
]
if __name__ == '__main__':
    # Command-line entry point: read the transcript from the arguments,
    # normalise it, and print the extracted numbers separated by spaces.
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits with status 1.
        sys.exit('usage: %s TRANSCRIPT' % sys.argv[0])
    # Join every argument so an unquoted transcript is not cut off after its
    # first word.
    transcript = ' '.join(sys.argv[1:]).lower()
    # Replace every non-letter with a space so punctuation and hyphens act as
    # word separators.
    transcript = ''.join(c if c.isalpha() else ' ' for c in transcript)
    digits = extract_digits(transcript)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(' '.join(map(str, digits)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment