Created
April 22, 2023 05:50
-
-
Save esnya/76661d9dfb1a6dea41c099d435865844 to your computer and use it in GitHub Desktop.
Monkey-Patch for espnet2.bin.tts_inference.Text2Speech to support pyopenjtalk 0.3 and python 3.10
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Original code: Apache License 2.0
# https://github.com/espnet/espnet/blob/master/LICENSE
# Modified by: esnya
# https://github.com/esnya
from espnet2.bin.tts_inference import Text2Speech
from espnet2.text.phoneme_tokenizer import (
    pyopenjtalk_g2p_accent_with_pause,
    pyopenjtalk_g2p_prosody,
)
def pyopenjtalk_g2p_accent_with_pause_patch(text) -> list[str]:
    """Convert *text* to a phoneme/accent symbol sequence, keeping pauses.

    Drop-in replacement for espnet2's ``pyopenjtalk_g2p_accent_with_pause``
    that feeds ``pyopenjtalk.run_frontend`` output through
    ``pyopenjtalk.make_label`` (the call shape pyopenjtalk 0.3 expects).

    Returns:
        list[str]: For each phoneme label either ``"pau"`` or the triple
        ``[phoneme, mora_position, accent_position]``.
    """
    import re
    import pyopenjtalk

    # Captures, from one full-context label: the phoneme (between '-' and
    # '+'), the /A: accent field, and the trailing number of the /F: field.
    label_pattern = re.compile(
        r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)"
    )
    symbols: list[str] = []
    for label in pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)):
        # Pauses carry no accent info — emit the bare "pau" marker.
        if label.split("-")[1].split("+")[0] == "pau":
            symbols.append("pau")
            continue
        matches = label_pattern.findall(label)
        if len(matches) == 1:
            phoneme, accent_pos, mora_pos = matches[0]
            # Order matches the original: phoneme, /F: number, /A: number.
            symbols.extend([phoneme, mora_pos, accent_pos])
    return symbols
def pyopenjtalk_g2p_prosody_patch(
    text: str, drop_unvoiced_vowels: bool = True
) -> list[str]:
    """Extract a phoneme + prosody symbol sequence from full-context labels.

    Patched variant of espnet2's ``pyopenjtalk_g2p_prosody`` that calls
    ``pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))`` as required
    by pyopenjtalk 0.3.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): Whether to lowercase unvoiced vowels
            (A/E/I/O/U), treating them as their voiced counterparts.

    Returns:
        List[str]: List of phoneme + prosody symbols ("^" start, "$" end,
        "?" question end, "_" pause, "#" accent-phrase border, "]" pitch
        fall, "[" pitch rise).

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
    """
    import re
    import pyopenjtalk
    # NOTE(review): depends on an espnet2-private helper — may break across
    # espnet versions; confirm it is still exported.
    from espnet2.text.phoneme_tokenizer import _numeric_feature_by_regex

    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)
    phones: list[str] = []
    for n in range(N):
        lab_curr = labels[n]
        # current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)  # type: ignore
        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()
        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check question form or not
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)
        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
        # NOTE(review): labels[n + 1] assumes the final label is sil/pau
        # (both branches above `continue`), so this lookahead never runs
        # on the last label — TODO confirm pyopenjtalk always emits a
        # trailing sil.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")
    return phones
def patch_tts(tts: Text2Speech) -> Text2Speech:
    """Swap the tokenizer's g2p callable for its pyopenjtalk-0.3 patch.

    Mutates ``tts`` in place when its tokenizer uses one of the two known
    espnet2 g2p functions; otherwise leaves it untouched. Returns the same
    ``tts`` object so calls can be chained.
    """
    tokenizer = tts.preprocess_fn.tokenizer  # type: ignore[attr-defined]
    # Known espnet2 g2p callables mapped to their pyopenjtalk-0.3 patches.
    replacements = {
        pyopenjtalk_g2p_prosody: pyopenjtalk_g2p_prosody_patch,
        pyopenjtalk_g2p_accent_with_pause: pyopenjtalk_g2p_accent_with_pause_patch,
    }
    for original, patched in replacements.items():
        if tokenizer.g2p == original:  # type: ignore[attr-defined]
            tokenizer.g2p = patched  # type: ignore[attr-defined]
            break
    return tts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment