Created
June 14, 2014 22:00
-
-
Save FLamparski/a3a2e4671eb6bfdbb97f to your computer and use it in GitHub Desktop.
Prepare a bunch of text for being read out loud by Festival TTS.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys, re, os | |
from itertools import chain | |
# Spoken names for the currency signs that are recognised.
# TODO: collect keys from this to build the currency regex.
CURRENCY_MAP = {
    '$': 'dollars',
    '£': 'pounds',
    '€': 'euro',
}

# Spoken words for magnitude suffixes, keyed by a single lowercase letter.
EXPONENT_MAP = {
    'b': 'billion',
    'm': 'million',
    'k': 'thousand',
    't': 'trillion',
}

# 'Fancy' characters and the mundane text each one is replaced with.
CHARSUBS_MAP = {
    '’': "'",    # curly apostrophe to ASCII
    '—': ' - ',  # em-dash to a minus sign/dash
}
def replace_banned(line):
    """Return `line` with every CHARSUBS_MAP key swapped for its value.

    'Banned' characters are the ones that make Festival go nuts and
    spell out individual letters of phrases.
    """
    cleaned = line
    for fancy, plain in CHARSUBS_MAP.items():
        cleaned = cleaned.replace(fancy, plain)
    return cleaned
def process_match(match):
    """Convert a currency expression such as $17bn to its spoken form.

    `match` must provide the named groups 'currency' and 'intval'; the
    'exponent' group is optional. "$17bn" becomes "17 billion dollars",
    and an amount with no magnitude suffix ("$17") becomes "17 dollars".

    Returns a tuple (notation, processed): the exact matched text and
    the text that should take its place.

    Raises KeyError if the currency sign or the first letter of the
    exponent is not a key of CURRENCY_MAP / EXPONENT_MAP.
    """
    gd = match.groupdict()
    notation = match.group()
    words = [gd['intval']]
    # An optional group that did not take part in the match is None;
    # indexing it would raise TypeError, so only look it up when present.
    exponent = gd.get('exponent')
    if exponent:
        words.append(EXPONENT_MAP[exponent[0].lower()])
    words.append(CURRENCY_MAP[gd['currency']])
    # The pattern may have swallowed a space after the amount; hand it
    # back so the following word does not get glued onto the result.
    trailing = notation[len(notation.rstrip()):]
    return (notation, " ".join(words) + trailing)
def process_hash(match):
    """Convert an expression that begins with a # (hash) to spoken form.

    `match` is expected to provide the named groups 'intval' and
    'hashtag'. The rules are:

    * only 'intval' matched  -> "number <intval>"
    * only 'hashtag' matched -> "hash tag <hashtag>"
    * both matched           -> the bare hashtag text
    * neither matched        -> the matched text, unchanged

    Returns a tuple (notation, processed): the exact matched text and
    the text that should take its place. `processed` is always a str,
    so it is safe to pass to str.replace or to return from an re.sub
    callback.
    """
    gd = match.groupdict()
    notation = match.group()
    intval = gd.get('intval')
    hashtag = gd.get('hashtag')
    if hashtag is None:
        if intval is None:
            # Nothing recognisable followed the '#': leave the text alone
            # rather than handing None to the caller as a replacement.
            return (notation, notation)
        return (notation, "number " + str(intval))
    if intval is None:
        return (notation, "hash tag " + str(hashtag))
    return (notation, hashtag)
def process_line_source(line_source, line_sink):
    """Rewrite every line from `line_source` and print it to `line_sink`.

    `line_source` is any iterable of strings (e.g. an open text file) and
    `line_sink` is any object accepted by print(file=...). For each line,
    in this order:

    1. currency expressions are replaced with the text produced by
       process_match,
    2. '#' expressions are replaced with the text produced by
       process_hash,
    3. the result goes through replace_banned and is printed without an
       extra line ending (lines keep whatever ending they arrived with).
    """
    # Raw strings keep the regex escapes out of Python's own string
    # escaping. The two-letter suffixes are listed before the one-letter
    # ones because alternation takes the first alternative that matches
    # ("b" would otherwise win over "bn" and strand the "n"). The \b stops
    # the first letter of an ordinary word ("$5 kids") from being read as
    # a suffix, and keeping the optional space inside the suffix group
    # means a plain amount does not swallow the space that follows it.
    currency_regex = re.compile(
        r"(?P<currency>[$£€])(?P<intval>\d+(?:\.\d+)?)"
        r"(?: ?(?P<exponent>bn|tn|[kKmMbB])\b)?")
    hash_regex = re.compile(r"#((?P<intval>\d+)|(?P<hashtag>[a-zA-Z]*))")

    def spoken_currency(match):
        return process_match(match)[1]

    def spoken_hash(match):
        return process_hash(match)[1]

    for line in line_source:
        # Substituting match by match (rather than str.replace on the
        # matched text) means a short expression such as "$5" or a lone
        # "#" can never rewrite part of a longer one ("$5k", "#1").
        line = currency_regex.sub(spoken_currency, line)
        line = hash_regex.sub(spoken_hash, line)
        print(replace_banned(line), file=line_sink, end='')
def __test__():
    """Run the file 'ttsin' through process_line_source, discarding output.

    Author's timing notes:
      35.9 usec per loop with generators; 129 with ttsin file
      34.2 usec per loop with lists; 128 with ttsin file
    """
    class _NullSink:
        """ Text sink that ignores everything written to it """
        def write(*ignored):
            pass

    # ttsin is the input file for my tts toolchain
    with open('ttsin', 'r') as tts_input:
        process_line_source(tts_input, _NullSink())
# When run as a script, filter standard input to standard output.
if __name__ == "__main__":
    process_line_source(line_source=sys.stdin, line_sink=sys.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment