-
-
Save madebyollin/508930c86fa12e1a70e32d91411485a8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
To use:
1. Install and set up the Google Cloud API and the dependencies listed on https://github.com/GoogleCloudPlatform/python-docs-samples/tree/master/texttospeech/cloud-client
2. Install pandoc and pypandoc, and also tqdm.
3. Create and download a service_account.json ("Service account key") from https://console.cloud.google.com/apis/credentials
4. Run: GOOGLE_APPLICATION_CREDENTIALS=service_account.json python make_audiobook.py book_name.epub
"""
import re
import sys
import time
from datetime import datetime as dt
from pathlib import Path

import pypandoc
from google.cloud import texttospeech
from tqdm import tqdm
# Per-minute limits used by the narrator's throttling.
# see https://cloud.google.com/text-to-speech/quotas
MAX_REQUESTS_PER_MINUTE = 200
MAX_CHARS_PER_MINUTE = 135000
def book_to_text(book_file):
    """Return the contents of ``book_file`` as plain text.

    The file is first handed to pandoc (through pypandoc) for conversion to
    the "plain" output format, with ``--wrap=none`` passed as an extra
    argument. If pypandoc raises ``RuntimeError``, a notice is printed and the
    file is read directly as UTF-8 text instead.
    """
    try:
        return pypandoc.convert_file(book_file, "plain", extra_args=["--wrap=none"])
    except RuntimeError:
        print("Format not recognized. Treating as plain text...")
    # Fallback path: treat the input as an already-plain UTF-8 text file.
    with open(book_file, encoding="utf-8") as book:
        return book.read()
def clean_text_chunk(text_chunk):
    """Return ``text_chunk`` with markup that should not be read aloud replaced.

    * every underscore (as used around _italics_) becomes a space;
    * every run of three or more hyphens (used for footnotes) becomes the
      word "Footnote:".
    """
    # A plain replace is enough for a single literal character.
    without_italics = text_chunk.replace("_", " ")
    return re.sub(r"-{3,}", "Footnote:", without_italics)
class Narrator:
    """Turn text into MP3 audio using the Google Cloud Text-to-Speech client.

    Requests are throttled per wall-clock minute so that neither the request
    budget nor the character budget (taken from MAX_REQUESTS_PER_MINUTE and
    MAX_CHARS_PER_MINUTE) is exceeded.
    """

    # Largest text payload, in UTF-8 bytes, sent in one synthesis request.
    # NOTE(review): 5000 bytes is believed to be the per-request input limit
    # listed at https://cloud.google.com/text-to-speech/quotas -- confirm it
    # is still current.
    MAX_BYTES_PER_REQUEST = 5000

    def __init__(self, voice_name="en-US-Wavenet-F"):
        """Create the API client and the voice/audio settings.

        Args:
            voice_name: name of the voice to request; the language code is
                fixed to "en-US".
        """
        self.client = texttospeech.TextToSpeechClient()
        self.voice = texttospeech.types.VoiceSelectionParams(
            language_code="en-US", name=voice_name
        )
        self.audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3
        )
        # Rate-limit bookkeeping: the budgets, the wall-clock minute the
        # counters belong to (-1 = none yet), and what was used in that minute.
        self._max_requests_per_minute = MAX_REQUESTS_PER_MINUTE
        self._max_chars_per_minute = MAX_CHARS_PER_MINUTE
        self._minute = -1
        self._requests_this_minute = 0
        self._chars_this_minute = 0

    def print_voice_names(self, lang="en"):
        """Print the name of every available voice whose name starts with ``lang``."""
        print("Available voices for language {}:".format(lang))
        for voice in self.client.list_voices().voices:
            if voice.name.startswith(lang):
                print(voice.name)

    def _quota_exhausted(self, pending_chars=0):
        """Return True if one more request would go over a per-minute budget.

        Args:
            pending_chars: number of characters the upcoming request carries.
        """
        if self._requests_this_minute == 0:
            # Nothing has been used in this minute, so waiting cannot help
            # (even if a single chunk is larger than the character budget).
            return False
        return (
            self._requests_this_minute + 1 > self._max_requests_per_minute
            or self._chars_this_minute + pending_chars > self._max_chars_per_minute
        )

    def _start_new_minute(self):
        """Reset the usage counters and tie them to the current wall-clock minute."""
        self._minute = dt.now().minute
        self._requests_this_minute = 0
        self._chars_this_minute = 0

    def _rate_limit(self, pending_chars=0):
        """Block until the upcoming request fits into the per-minute budgets.

        The check is made *before* the request is sent and includes the
        request itself, so the budgets are never overshot.

        Args:
            pending_chars: number of characters the upcoming request carries.
        """
        if dt.now().minute != self._minute:
            # The minute has already rolled over: fresh budget, no waiting.
            self._start_new_minute()
            return
        if self._quota_exhausted(pending_chars):
            # Poll until the wall-clock minute changes, then start afresh.
            while dt.now().minute == self._minute:
                time.sleep(5)
            self._start_new_minute()

    @staticmethod
    def _split_chunk(text_chunk, max_bytes=MAX_BYTES_PER_REQUEST):
        """Split ``text_chunk`` into pieces of at most ``max_bytes`` UTF-8 bytes.

        Text that already fits is returned unchanged as a one-element list.
        Otherwise the text is broken at single spaces; a word that is too
        long by itself is cut between characters. Pieces that contain only
        whitespace are dropped.
        """
        if len(text_chunk.encode("utf-8")) <= max_bytes:
            return [text_chunk]

        pieces = []
        words = []  # words collected for the piece being built
        size = 0  # UTF-8 size of " ".join(words)
        for word in text_chunk.split(" "):
            word_size = len(word.encode("utf-8"))
            extra = word_size + (1 if words else 0)  # +1 for the joining space
            if size + extra <= max_bytes:
                words.append(word)
                size += extra
                continue
            if words:
                pieces.append(" ".join(words))
            while word_size > max_bytes:
                # Cut on a character boundary; errors="ignore" drops a
                # trailing partial character, which stays in the remainder.
                head = word.encode("utf-8")[:max_bytes].decode("utf-8", errors="ignore")
                head = head or word[0]  # guarantees progress for tiny limits
                pieces.append(head)
                word = word[len(head):]
                word_size = len(word.encode("utf-8"))
            words = [word]
            size = word_size
        if words:
            pieces.append(" ".join(words))
        return [piece for piece in pieces if piece.strip()]

    def _text_chunk_to_audio_chunk(self, text_chunk):
        """Synthesize ``text_chunk`` and return the audio content of the response."""
        self._rate_limit(len(text_chunk))
        input_text = texttospeech.types.SynthesisInput(text=text_chunk)
        response = self.client.synthesize_speech(
            input_text, self.voice, self.audio_config
        )
        self._requests_this_minute += 1
        self._chars_this_minute += len(text_chunk)
        return response.audio_content

    def text_to_mp3(self, text, file_dest):
        """Narrate ``text`` line by line into the file ``file_dest``.

        Args:
            text: the full text to narrate.
            file_dest: destination path object; its suffix must be ".mp3".
        """
        # Kept as an assert so a wrong suffix fails exactly as it did before.
        assert file_dest.suffix == ".mp3"
        lines = text.splitlines()
        with file_dest.open("wb") as out:
            for line in tqdm(lines, desc=file_dest.stem):
                text_chunk = clean_text_chunk(line)
                # skip empty lines and lines with nothing left to speak
                if not text_chunk.strip():
                    continue
                for piece in self._split_chunk(text_chunk):
                    audio_chunk = self._text_chunk_to_audio_chunk(piece)
                    # this is fine because mp3s can be concatenated naively and still work
                    out.write(audio_chunk)
def main():
    """Narrate every book named on the command line into an mp3 next to it.

    With no arguments, prints a usage line and exits with status 1. Otherwise
    each argument is converted to text, narrated, and written to a file with
    the same path but an ".mp3" suffix.
    """
    book_files = sys.argv[1:]
    if not book_files:
        print(
            "Usage: GOOGLE_APPLICATION_CREDENTIALS=service_account.json {} book_name.epub".format(
                sys.argv[0]
            )
        )
        sys.exit(1)
    # One narrator (and therefore one API client) is shared by all books.
    narrator = Narrator()
    # narrator.print_voice_names()
    for book_file in book_files:
        text = book_to_text(book_file)
        mp3_path = Path(book_file).with_suffix(".mp3")
        narrator.text_to_mp3(text, mp3_path)
        print("Generated mp3", mp3_path)
        # I have another script that uploads to overcast...
        # import subprocess as sp
        # sp.call("upload.py '" + str(mp3_path) + "'", shell=True)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Yeah, mp3 concatenation via `cat` is a hack that happens to work for the mp3 format. You should use `sox` or `ffmpeg` to do proper concatenation. And the `os.path` stuff should all be using `pathlib` instead; I'm not sure about the state of temp directories on Windows, so that probably needs to be fixed too.
Amazing script, does it work with pdf? It would be great to turn this into a full blown project!
It won't work with a PDF out of the box (since pandoc does not import PDFs). But you can run pdftotext to extract the text first and then run this script on the extracted text. In general, extracting comprehensible text from a PDF is a nightmare, but for simple documents it should work.
Ok, thanks for the info and the script, I appreciate it! 🙏🏻
The script is awesome and works well! I want to modify the pitch in voice. Can you suggest me some code for that?
@madebyollin
The script is awesome and works well! I want to modify the pitch in voice. Can you suggest me some code for that?
@madebyollin
you can add pitch / rate parameters to the audio config (line 48 of the script) as per their documentation, e.g.
texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3,
pitch=-3,
speaking_rate=0.8
)
Damn this is awesome. Thank you. I think I'm gonna try to turn this into a small personal project. I spent the last hour throwing code at the wall to see what sticks, and it already can create opus albums (1 track / chapter).
Nice script! Though if we split the file based on lines, the audio will get jagged. I have made some changes, like splitting the file based on "." (periods). If anyone wants to check out my version (based on this script), they can head over to this repo; any improvements or suggestions are welcome too.
Using `OGG_OPUS` as the audio encoding yields significantly better audio quality but messes with duration/seeking, don't know why. EDIT: that must be due to the fact that "mp3s can be concatenated naively and still work" but ogg files can't... maybe using ffmpeg? We could also produce tiny tracks for every `text_chunk` and append them all in the end. Also, I had to change `os.path.join("/tmp/"` to `os.path.join("tmp/"` on Windows, and you should put `encoding="UTF8"` when treating the files as plain text (I wish we could open pull requests on gists).