Last active
February 2, 2024 18:09
-
-
Save cuckookernel/d7ccf214a000c025b723554cbb5fac59 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
""" | |
Turn a long text into a sound file, using Coqui TTS: text-to-speech | |
Example usage: | |
./doc_2_audio.py -e utf16 my_dir/my_document.txt | |
Will produce output my_dir/my_document.wav | |
For other options: | |
./doc_2_audio.py -h | |
Requirements: | |
pip install TTS # install coqui-tts library | |
It could be a good idea to create venv first (before pip install): | |
python3 -m venv venv-d2a; source venv-d2a/bin/activate; pip install wheel
TODO: | |
- Automatically extract plain text from a pdf document. | |
- Generate mp3 instead of wav | |
""" | |
import re | |
import argparse | |
from pathlib import Path | |
import torch | |
from TTS.api import TTS | |
# Short model keys (as accepted by the -k/--tts_model_key option) mapped to the
# model name that is handed to the TTS constructor.
TTS_MODELS_BY_KEY: dict[str, str] = {
    "vits": "tts_models/en/vctk/vits",
}
# %% | |
def main():
    """Command-line entry point: parse the arguments and run the conversion."""
    cli = get_cli_args()
    # argparse delivers the file name as a plain string; run() expects a Path.
    source_path = Path(cli.in_txt_file)
    run(
        in_txt_file=source_path,
        encoding=cli.encoding,
        tts_model_key=cli.tts_model_key,
        speaker=cli.speaker,
    )
def get_cli_args() -> argparse.Namespace:
    """Parse this script's command-line arguments.

    Returns:
        A namespace with the attributes ``in_txt_file``, ``encoding``,
        ``tts_model_key`` and ``speaker``.
    """
    arg_parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring holds hand-formatted usage examples; the default
        # help formatter would re-wrap them into one run-on paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_parser.add_argument('in_txt_file', help='input text file')
    arg_parser.add_argument('-e', '--encoding', help="input text file's encoding",
                            default="utf8")
    arg_parser.add_argument('-k', '--tts_model_key',
                            help=f'tts model key, possible values: '
                                 f'{list(TTS_MODELS_BY_KEY.keys())}',
                            # Reject unknown keys right away with a usage error
                            # instead of a later failed lookup in
                            # TTS_MODELS_BY_KEY.
                            choices=list(TTS_MODELS_BY_KEY),
                            default="vits")
    arg_parser.add_argument('-s', '--speaker',
                            help='speaker used in the case of a multispeaker model',
                            default="p225")
    return arg_parser.parse_args()
def run(*, in_txt_file: Path, encoding: str, tts_model_key: str,
        speaker: str) -> None:
    """Synthesize the contents of a text file into a wav file.

    The output is written next to the input, using the same path with its
    suffix replaced by ``.wav``.

    Args:
        in_txt_file: Path of the text file to read.
        encoding: Text encoding used to decode ``in_txt_file``.
        tts_model_key: Key into ``TTS_MODELS_BY_KEY`` selecting the model.
        speaker: Speaker identifier forwarded to ``tts_to_file``.

    Raises:
        KeyError: If ``tts_model_key`` is not a key of ``TTS_MODELS_BY_KEY``.
    """
    model_name = TTS_MODELS_BY_KEY[tts_model_key]

    # Run on the GPU when torch reports CUDA is available, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    synthesizer = TTS(model_name).to(device)

    # All sections are synthesized in one call, joined by newlines.
    full_text = "\n".join(read_text_file(in_txt_file, encoding=encoding))

    wav_path = in_txt_file.with_suffix(".wav")
    synthesizer.tts_to_file(text=full_text, speaker=speaker, file_path=wav_path)
    print(f"Done generating wav file: {wav_path}")
# %% | |
def read_text_file(in_txt_file: Path, encoding: str) -> list[str]:
    """Return one cleaned string for each section of the text file.

    A section is a run of text delimited by one or more blank (or
    whitespace-only) lines. Each section is passed through ``clean_section``
    and sections that end up empty are dropped. A short summary of every
    section is printed.

    Args:
        in_txt_file: Path of the text file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        The cleaned, non-empty sections in document order.
    """
    whole_text = in_txt_file.read_text(encoding=encoding)

    # `\s*` also swallows additional blank lines, so several blank lines in a
    # row act as a single separator instead of producing empty sections.
    raw_sections = re.split(r"\n\s*\n", whole_text)

    # Keep the *cleaned* text: previously the cleaned section was only printed
    # and the raw, uncleaned sections were returned to the caller.
    cleaned_sections = (clean_section(raw).strip() for raw in raw_sections)
    sections = [section for section in cleaned_sections if section]

    print(f"{len(sections)} sections found:")
    for i, section in enumerate(sections):
        print(f" {i}: {len(section)} : {section[:16]} .. {section[-16:]}")
    return sections
# %% | |
def clean_section(section: str) -> str:
    """Collapse a multi-line section into a single space-separated line.

    Every line has its surrounding whitespace removed first and then any
    leading/trailing double quotes; the results are joined with single spaces.
    """
    return " ".join(
        piece.strip().strip('"') for piece in section.split("\n")
    )
# %% | |
def _interactive_testing():
    """Scratch cell: run the conversion by hand on a hard-coded local file."""
    # %%
    sample_doc = Path("/home/teo/gdrive_rclone/EBooks/Machine Learning & Statistics/"
                      "Incorporating_Ethics_into_Artificial_Intelligence_.txt")
    run(
        in_txt_file=sample_doc,
        encoding="utf8",
        tts_model_key="vits",
        speaker="p225",
    )
# %% | |
# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment