Skip to content

Instantly share code, notes, and snippets.

Created June 14, 2014 22:00
Show Gist options
  • Save FLamparski/a3a2e4671eb6bfdbb97f to your computer and use it in GitHub Desktop.
Save FLamparski/a3a2e4671eb6bfdbb97f to your computer and use it in GitHub Desktop.
Prepare a bunch of text for being read out loud by Festival TTS.
#!/usr/bin/env python3
import sys, re, os
from itertools import chain
# Spoken names for the currency signs this script understands.
# TODO: collect keys from this to build the currency regex.
CURRENCY_MAP = {
    '$': 'dollars',
    '£': 'pounds',
    '€': 'euro',
}

# Spoken words for magnitude suffixes, keyed by the lowercase first letter
# of the suffix (so both 'b' and 'bn' resolve through 'b').
EXPONENT_MAP = {
    'b': 'billion',
    'm': 'million',
    'k': 'thousand',
    't': 'trillion',
}

# 'Fancy' characters and the mundane text substituted for each of them.
CHARSUBS_MAP = {
    '’': "'",    # curly apostrophe -> ASCII apostrophe
    '—': ' - ',  # em-dash -> spaced minus sign/dash
}
def replace_banned(line):
    """Return `line` with every CHARSUBS_MAP key replaced by its value.

    "Banned" characters are the ones that cause Festival to go nuts and
    spell out individual letters of phrases.
    """
    cleaned = line
    for fancy, plain in CHARSUBS_MAP.items():
        cleaned = cleaned.replace(fancy, plain)
    return cleaned
def process_match(match):
    """Convert a currency expression such as "$17bn" to its spoken form.

    For example "$17bn" becomes "17 billion dollars". When the expression
    has no magnitude suffix (e.g. "$5") the result is just the amount and
    the currency name ("5 dollars").

    Args:
        match: a regex match object exposing the named groups `currency`
            and `intval`, and optionally `exponent`.

    Returns:
        A `(notation, processed)` tuple: the original matched text and its
        spoken equivalent, suitable for `str.replace(notation, processed)`.

    Raises:
        KeyError: if the currency sign is not in CURRENCY_MAP or the first
            letter of the exponent is not in EXPONENT_MAP.
    """
    groups = match.groupdict()
    notation = match.group(0)

    words = [groups['intval']]
    # The exponent group is optional in the regex, so it may be None.
    exponent = groups.get('exponent')
    if exponent:
        # Only the first letter matters: 'b' and 'bn' both mean billion.
        words.append(EXPONENT_MAP[exponent[0].lower()])
    words.append(CURRENCY_MAP[groups['currency']])

    return (notation, " ".join(words))
def process_hash(match):
    """Convert an expression that begins with a # (hash) to its spoken form.

    The rule is that if the match object defines a `hashtag` group, then
    it is a hash tag ("#python" -> "hash tag python"); otherwise, if it
    defines an `intval` group, it is a number ("#42" -> "number 42").

    Args:
        match: a regex match object with the named groups `intval` and/or
            `hashtag`.

    Returns:
        A `(notation, processed)` tuple: the original matched text and its
        spoken equivalent. If neither group took part in the match the
        text is returned unchanged, so the second element is always a
        string and is safe to use as a replacement.
    """
    groups = match.groupdict()
    notation = match.group(0)
    hashtag = groups.get('hashtag')
    intval = groups.get('intval')

    if hashtag is not None:
        return (notation, "hash tag " + hashtag)
    if intval is not None:
        return (notation, "number " + intval)
    # Nothing recognisable after the '#': leave the text as it was.
    return (notation, notation)
def process_line_source(line_source, line_sink):
    """Rewrite each line of `line_source` for speech and print it to `line_sink`.

    For every line, currency expressions are expanded with process_match,
    then '#' expressions with process_hash, and finally "banned" characters
    are substituted by replace_banned. Lines are printed without an added
    newline, so the line endings of the input are preserved.

    Args:
        line_source: an iterable of text lines (e.g. an open file or stdin).
        line_sink: a file-like object accepted by `print(..., file=...)`.
    """
    # Raw strings avoid invalid escape sequences. The two-letter suffixes
    # come first so "bn" is not matched as "b" plus a stray "n". The space
    # and suffix are optional as a unit and must end on a word boundary, so
    # "$5 Bob" or "$5 apples" do not lose their space or first letter.
    currency_regex = re.compile(
        r"(?P<currency>[$£€])(?P<intval>\d+)"
        r"(?: ?(?P<exponent>bn|tn|[kKmMbB])\b)?")
    hash_regex = re.compile(r"#((?P<intval>\d+)|(?P<hashtag>[a-zA-Z]*))")

    def spoken(converter):
        """Adapt a (notation, processed) converter for use with re.sub."""
        def substitute(match):
            replacement = converter(match)[1]
            # Keep the original text if the converter produced nothing.
            return match.group(0) if replacement is None else replacement
        return substitute

    speak_currency = spoken(process_match)
    speak_hash = spoken(process_hash)

    for line in line_source:
        # Substituting each match at its own position (instead of a
        # line-wide str.replace) stops one expression from clobbering
        # another that shares a prefix, e.g. "$1" inside "$17bn".
        line = currency_regex.sub(speak_currency, line)
        line = hash_regex.sub(speak_hash, line)
        print(replace_banned(line), file=line_sink, end='')
def __test__():
    """Run the local `ttsin` file through process_line_source, discarding output.

    The timing notes below suggest this was used as a benchmark harness.
    Raises whatever `open` raises if `ttsin` is missing from the current
    working directory.

    NOTE(review): doctest treats a module-level `__test__` as a dict of
    extra tests; this function would conflict if doctest were ever run on
    this module.
    """
    # 35.9 usec per loop with generators; 129 with ttsin file
    # 34.2 usec per loop with lists; 128 with ttsin file
    class _devnull:
        """ Dummy text sink """
        def write(*a):
            # Accept and drop whatever is written (self arrives in *a too).
            pass

    with open('ttsin', 'r') as f:
        # ttsin is the input file for my tts toolchain
        process_line_source(f, _devnull())
if __name__ == "__main__":
    # Act as a filter: raw text comes in on stdin, TTS-ready text goes to stdout.
    process_line_source(line_source=sys.stdin, line_sink=sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment