Created
April 22, 2023 05:50
-
-
Save esnya/76661d9dfb1a6dea41c099d435865844 to your computer and use it in GitHub Desktop.
Monkey-Patch for espnet2.bin.tts_inference.Text2Speech to support pyopenjtalk 0.3 and python 3.10
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Original code: Apache License 2.0
# https://github.com/espnet/espnet/blob/master/LICENSE
# Modified by: esnya
# https://github.com/esnya
from espnet2.bin.tts_inference import Text2Speech
from espnet2.text.phoneme_tokenizer import (
    pyopenjtalk_g2p_accent_with_pause,
    pyopenjtalk_g2p_prosody,
)
def pyopenjtalk_g2p_accent_with_pause_patch(text) -> list[str]:
    """Convert *text* to a phoneme/accent symbol sequence, keeping pauses.

    Drop-in replacement for espnet2's ``pyopenjtalk_g2p_accent_with_pause``
    that feeds ``pyopenjtalk.run_frontend`` output through
    ``pyopenjtalk.make_label`` (the call shape pyopenjtalk 0.3 expects).

    Returns:
        list[str]: For each phoneme label either ``"pau"`` or the triple
        ``[phoneme, mora_position, accent_position]``.
    """
    import re
    import pyopenjtalk

    # Captures, from one full-context label: the phoneme (between '-' and
    # '+'), the /A: accent field, and the trailing number of the /F: field.
    label_pattern = re.compile(
        r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)"
    )
    symbols: list[str] = []
    for label in pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)):
        # Pauses carry no accent info — emit the bare "pau" marker.
        if label.split("-")[1].split("+")[0] == "pau":
            symbols.append("pau")
            continue
        matches = label_pattern.findall(label)
        if len(matches) == 1:
            phoneme, accent_pos, mora_pos = matches[0]
            # Order matches the original: phoneme, /F: number, /A: number.
            symbols.extend([phoneme, mora_pos, accent_pos])
    return symbols
def pyopenjtalk_g2p_prosody_patch(
    text: str, drop_unvoiced_vowels: bool = True
) -> list[str]:
    """Extract a phoneme + prosody symbol sequence from full-context labels.

    Patched variant of espnet2's ``pyopenjtalk_g2p_prosody`` that calls
    ``pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))`` as required
    by pyopenjtalk 0.3.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): Whether to lowercase unvoiced vowels
            (A/E/I/O/U), treating them as their voiced counterparts.

    Returns:
        List[str]: List of phoneme + prosody symbols ("^" start, "$" end,
        "?" question end, "_" pause, "#" accent-phrase border, "]" pitch
        fall, "[" pitch rise).

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
    """
    import re
    import pyopenjtalk
    # NOTE(review): depends on an espnet2-private helper — may break across
    # espnet versions; confirm it is still exported.
    from espnet2.text.phoneme_tokenizer import _numeric_feature_by_regex

    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)
    phones: list[str] = []
    for n in range(N):
        lab_curr = labels[n]
        # current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)  # type: ignore
        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()
        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check question form or not
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)
        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
        # NOTE(review): labels[n + 1] assumes the final label is sil/pau
        # (both branches above `continue`), so this lookahead never runs
        # on the last label — TODO confirm pyopenjtalk always emits a
        # trailing sil.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")
    return phones
def patch_tts(tts: Text2Speech) -> Text2Speech:
    """Swap the tokenizer's g2p callable for its pyopenjtalk-0.3 patch.

    Mutates ``tts`` in place when its tokenizer uses one of the two known
    espnet2 g2p functions; otherwise leaves it untouched. Returns the same
    ``tts`` object so calls can be chained.
    """
    tokenizer = tts.preprocess_fn.tokenizer  # type: ignore[attr-defined]
    # Known espnet2 g2p callables mapped to their pyopenjtalk-0.3 patches.
    replacements = {
        pyopenjtalk_g2p_prosody: pyopenjtalk_g2p_prosody_patch,
        pyopenjtalk_g2p_accent_with_pause: pyopenjtalk_g2p_accent_with_pause_patch,
    }
    for original, patched in replacements.items():
        if tokenizer.g2p == original:  # type: ignore[attr-defined]
            tokenizer.g2p = patched  # type: ignore[attr-defined]
            break
    return tts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment