madebyollin · June 6, 2025 07:24 · JafarAbbas33 · Jan 19, 2023
diff --git a/make_audiobook.py b/make_audiobook.py
 #!/usr/bin/env python3
 """
 To use:
 1. Install and set up the Google Cloud API and the dependencies listed at https://github.com/GoogleCloudPlatform/python-docs-samples/tree/master/texttospeech/cloud-client
 2. Install pandoc, pypandoc, and tqdm.
 3. Create and download a service_account.json ("Service account key") from https://console.cloud.google.com/apis/credentials
 4. Run: GOOGLE_APPLICATION_CREDENTIALS=service_account.json python make_audiobook.py book_name.epub
 """
 import re
 import sys
 import time
 from datetime import datetime as dt
 from pathlib import Path

 from google.cloud import texttospeech
 from tqdm import tqdm

 import pypandoc

 # Client-side throttling limits consulted by Narrator before each request.
 # see https://cloud.google.com/text-to-speech/quotas
 # Number of synthesis requests allowed within one minute.
 MAX_REQUESTS_PER_MINUTE = 200
 # Number of input characters allowed within one minute.
 MAX_CHARS_PER_MINUTE = 135000


 def book_to_text(book_file):
    """Return the contents of ``book_file`` as plain text.

    The file is first handed to ``pypandoc.convert_file`` for conversion to
    the "plain" format with ``--wrap=none``. If that call raises
    ``RuntimeError``, a notice is printed and the file is read verbatim as
    UTF-8 text instead.
    """
    pandoc_args = ["--wrap=none"]
    try:
        plain_text = pypandoc.convert_file(book_file, "plain", extra_args=pandoc_args)
    except RuntimeError:
        print("Format not recognized. Treating as plain text...")
        with open(book_file, encoding="utf-8") as raw_book:
            plain_text = raw_book.read()
    return plain_text


 def clean_text_chunk(text_chunk):
    """Tidy one chunk of book text before it is sent for narration.

    Every underscore (as used around _italics_) becomes a space, and each
    run of three or more hyphens is replaced by the word "Footnote:".
    """
    without_underscores = text_chunk.replace("_", " ")
    return re.sub(r"(\-{3,})", "Footnote:", without_underscores)


 class Narrator:
    """Turn text into MP3 audio through the Google Cloud Text-to-Speech client.

    Requests are throttled client-side: within one wall-clock minute no more
    than ``max_requests_per_minute`` requests and ``max_chars_per_minute``
    characters are submitted.
    """

    def __init__(
        self,
        voice_name="en-US-Wavenet-F",
        language_code="en-US",
        max_requests_per_minute=None,
        max_chars_per_minute=None,
    ):
        """Create the API client and the voice/audio settings.

        Args:
            voice_name: Name of the voice to request.
            language_code: Language code sent together with ``voice_name``.
            max_requests_per_minute: Request budget per minute; ``None``
                falls back to the module-level ``MAX_REQUESTS_PER_MINUTE``.
            max_chars_per_minute: Character budget per minute; ``None``
                falls back to the module-level ``MAX_CHARS_PER_MINUTE``.
        """
        # NOTE(review): `texttospeech.types` / `texttospeech.enums` appear to
        # be the pre-2.0 layout of google-cloud-texttospeech -- confirm the
        # installed client version still provides them.
        self.client = texttospeech.TextToSpeechClient()
        self.voice = texttospeech.types.VoiceSelectionParams(
            language_code=language_code, name=voice_name
        )
        self.audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3
        )
        # rate limit stuff
        if max_requests_per_minute is None:
            max_requests_per_minute = MAX_REQUESTS_PER_MINUTE
        if max_chars_per_minute is None:
            max_chars_per_minute = MAX_CHARS_PER_MINUTE
        self._max_requests_per_minute = max_requests_per_minute
        self._max_chars_per_minute = max_chars_per_minute
        # -1 never equals a real minute, so the first call starts a window.
        self._minute = -1
        self._requests_this_minute = 0
        self._chars_this_minute = 0

    def print_voice_names(self, lang="en"):
        """Print the name of every available voice whose name starts with ``lang``."""
        print("Available voices for language {}:".format(lang))
        for voice in self.client.list_voices().voices:
            if voice.name.startswith(lang):
                print(voice.name)

    def _quota_exhausted(self, upcoming_chars=0):
        """Return True if one more request would exceed this minute's budget.

        Args:
            upcoming_chars: Size of the chunk that is about to be sent.
        """
        if self._requests_this_minute == 0:
            # Nothing was sent in this minute, so waiting frees no budget.
            return False
        return (
            # `>=` (not `>`): the next request must not become number max+1.
            self._requests_this_minute >= self._max_requests_per_minute
            or self._chars_this_minute + upcoming_chars > self._max_chars_per_minute
        )

    def _rate_limit(self, upcoming_chars=0):
        """Block until another request fits, then roll the counters if needed.

        When the budget of the current minute is used up, sleeps in 5-second
        steps until the wall-clock minute changes. Whenever the minute
        differs from the recorded one, both counters restart at zero.
        """
        if self._quota_exhausted(upcoming_chars):
            while dt.now().minute == self._minute:
                time.sleep(5)
        current_minute = dt.now().minute
        if current_minute != self._minute:
            self._minute = current_minute
            self._requests_this_minute = 0
            self._chars_this_minute = 0

    def _text_chunk_to_audio_chunk(self, text_chunk):
        """Synthesize ``text_chunk`` and return the response's audio content."""
        self._rate_limit(len(text_chunk))
        input_text = texttospeech.types.SynthesisInput(text=text_chunk)
        response = self.client.synthesize_speech(
            input_text, self.voice, self.audio_config
        )
        self._requests_this_minute += 1
        self._chars_this_minute += len(text_chunk)
        return response.audio_content

    def text_to_mp3(self, text, file_dest):
        """Narrate ``text`` line by line into the file ``file_dest``.

        Args:
            text: Text to narrate; each line becomes one synthesis request.
            file_dest: ``pathlib.Path``-like destination whose suffix must
                be ``.mp3``.
        """
        assert file_dest.suffix == ".mp3", "file_dest must have the .mp3 suffix"
        with file_dest.open("wb") as out:
            for raw_line in tqdm(text.splitlines(), desc=file_dest.stem):
                text_chunk = clean_text_chunk(raw_line)
                # skip lines that are empty or only whitespace once cleaned;
                # they contain nothing to speak
                if not text_chunk.strip():
                    continue
                audio_chunk = self._text_chunk_to_audio_chunk(text_chunk)
                # this is fine because mp3s can be concatenated naively and still work
                out.write(audio_chunk)


 def main():
    """Command-line entry point: narrate every book named in ``sys.argv``.

    Without arguments a usage line is printed and the process exits with
    status 1. Otherwise a single ``Narrator`` is created and each argument
    is converted to text and written to an ``.mp3`` file at the same path
    with the suffix replaced.
    """
    book_files = sys.argv[1:]
    if not book_files:
        usage = "Usage: GOOGLE_APPLICATION_CREDENTIALS=service_account.json {} book_name.epub"
        print(usage.format(sys.argv[0]))
        sys.exit(1)

    narrator = Narrator()

    # To list the selectable voices, enable:
    # narrator.print_voice_names()

    for book_file in book_files:
        book_text = book_to_text(book_file)
        mp3_path = Path(book_file).with_suffix(".mp3")
        narrator.text_to_mp3(book_text, mp3_path)
        print("Generated mp3", mp3_path)

        # The original author runs a separate upload script (for overcast)
        # on mp3_path at this point; that script is not part of this file.


 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	To use:
	1. Install and set up the Google Cloud API and the dependencies listed at https://github.com/GoogleCloudPlatform/python-docs-samples/tree/master/texttospeech/cloud-client
	2. Install pandoc, pypandoc, and tqdm.
	3. Create and download a service_account.json ("Service account key") from https://console.cloud.google.com/apis/credentials
	4. Run: GOOGLE_APPLICATION_CREDENTIALS=service_account.json python make_audiobook.py book_name.epub
	"""
	import re
	import sys
	import time
	from datetime import datetime as dt
	from pathlib import Path

	from google.cloud import texttospeech
	from tqdm import tqdm

	import pypandoc

	# Client-side throttling limits consulted by Narrator before each request.
	# see https://cloud.google.com/text-to-speech/quotas
	# Number of synthesis requests allowed within one minute.
	MAX_REQUESTS_PER_MINUTE = 200
	# Number of input characters allowed within one minute.
	MAX_CHARS_PER_MINUTE = 135000


	def book_to_text(book_file):
	    """Return the contents of ``book_file`` as plain text.

	    The file is first handed to ``pypandoc.convert_file`` for conversion to
	    the "plain" format with ``--wrap=none``. If that call raises
	    ``RuntimeError``, a notice is printed and the file is read verbatim as
	    UTF-8 text instead.
	    """
	    pandoc_args = ["--wrap=none"]
	    try:
	        plain_text = pypandoc.convert_file(book_file, "plain", extra_args=pandoc_args)
	    except RuntimeError:
	        print("Format not recognized. Treating as plain text...")
	        with open(book_file, encoding="utf-8") as raw_book:
	            plain_text = raw_book.read()
	    return plain_text


	def clean_text_chunk(text_chunk):
	    """Tidy one chunk of book text before it is sent for narration.

	    Every underscore (as used around _italics_) becomes a space, and each
	    run of three or more hyphens is replaced by the word "Footnote:".
	    """
	    # remove _italics_
	    text_chunk = re.sub(r"_", " ", text_chunk)
	    # remove --- hyphens for footnotes
	    text_chunk = re.sub(r"(\-{3,})", "Footnote:", text_chunk)
	    return text_chunk


	class Narrator:
	    """Turn text into MP3 audio through the Google Cloud Text-to-Speech client.

	    Requests are throttled client-side: within one wall-clock minute no more
	    than ``max_requests_per_minute`` requests and ``max_chars_per_minute``
	    characters are submitted.
	    """

	    def __init__(
	        self,
	        voice_name="en-US-Wavenet-F",
	        language_code="en-US",
	        max_requests_per_minute=None,
	        max_chars_per_minute=None,
	    ):
	        """Create the API client and the voice/audio settings.

	        Args:
	            voice_name: Name of the voice to request.
	            language_code: Language code sent together with ``voice_name``.
	            max_requests_per_minute: Request budget per minute; ``None``
	                falls back to the module-level ``MAX_REQUESTS_PER_MINUTE``.
	            max_chars_per_minute: Character budget per minute; ``None``
	                falls back to the module-level ``MAX_CHARS_PER_MINUTE``.
	        """
	        # NOTE(review): `texttospeech.types` / `texttospeech.enums` appear to
	        # be the pre-2.0 layout of google-cloud-texttospeech -- confirm the
	        # installed client version still provides them.
	        self.client = texttospeech.TextToSpeechClient()
	        self.voice = texttospeech.types.VoiceSelectionParams(
	            language_code=language_code, name=voice_name
	        )
	        self.audio_config = texttospeech.types.AudioConfig(
	            audio_encoding=texttospeech.enums.AudioEncoding.MP3
	        )
	        # rate limit stuff
	        if max_requests_per_minute is None:
	            max_requests_per_minute = MAX_REQUESTS_PER_MINUTE
	        if max_chars_per_minute is None:
	            max_chars_per_minute = MAX_CHARS_PER_MINUTE
	        self._max_requests_per_minute = max_requests_per_minute
	        self._max_chars_per_minute = max_chars_per_minute
	        # -1 never equals a real minute, so the first call starts a window.
	        self._minute = -1
	        self._requests_this_minute = 0
	        self._chars_this_minute = 0

	    def print_voice_names(self, lang="en"):
	        """Print the name of every available voice whose name starts with ``lang``."""
	        print("Available voices for language {}:".format(lang))
	        for voice in self.client.list_voices().voices:
	            if voice.name.startswith(lang):
	                print(voice.name)

	    def _quota_exhausted(self, upcoming_chars=0):
	        """Return True if one more request would exceed this minute's budget.

	        Args:
	            upcoming_chars: Size of the chunk that is about to be sent.
	        """
	        if self._requests_this_minute == 0:
	            # Nothing was sent in this minute, so waiting frees no budget.
	            return False
	        return (
	            # `>=` (not `>`): the next request must not become number max+1.
	            self._requests_this_minute >= self._max_requests_per_minute
	            or self._chars_this_minute + upcoming_chars > self._max_chars_per_minute
	        )

	    def _rate_limit(self, upcoming_chars=0):
	        """Block until another request fits, then roll the counters if needed.

	        When the budget of the current minute is used up, sleeps in 5-second
	        steps until the wall-clock minute changes. Whenever the minute
	        differs from the recorded one, both counters restart at zero.
	        """
	        if self._quota_exhausted(upcoming_chars):
	            while dt.now().minute == self._minute:
	                time.sleep(5)
	        current_minute = dt.now().minute
	        if current_minute != self._minute:
	            self._minute = current_minute
	            self._requests_this_minute = 0
	            self._chars_this_minute = 0

	    def _text_chunk_to_audio_chunk(self, text_chunk):
	        """Synthesize ``text_chunk`` and return the response's audio content."""
	        self._rate_limit(len(text_chunk))
	        input_text = texttospeech.types.SynthesisInput(text=text_chunk)
	        response = self.client.synthesize_speech(
	            input_text, self.voice, self.audio_config
	        )
	        self._requests_this_minute += 1
	        self._chars_this_minute += len(text_chunk)
	        return response.audio_content

	    def text_to_mp3(self, text, file_dest):
	        """Narrate ``text`` line by line into the file ``file_dest``.

	        Args:
	            text: Text to narrate; each line becomes one synthesis request.
	            file_dest: ``pathlib.Path``-like destination whose suffix must
	                be ``.mp3``.
	        """
	        assert file_dest.suffix == ".mp3", "file_dest must have the .mp3 suffix"
	        with file_dest.open("wb") as out:
	            for raw_line in tqdm(text.splitlines(), desc=file_dest.stem):
	                text_chunk = clean_text_chunk(raw_line)
	                # skip lines that are empty or only whitespace once cleaned;
	                # they contain nothing to speak
	                if not text_chunk.strip():
	                    continue
	                audio_chunk = self._text_chunk_to_audio_chunk(text_chunk)
	                # this is fine because mp3s can be concatenated naively and still work
	                out.write(audio_chunk)


	def main():
	    """Command-line entry point: narrate every book named in ``sys.argv``.

	    Without arguments a usage line is printed and the process exits with
	    status 1. Otherwise a single ``Narrator`` is created and each argument
	    is converted to text and written to an ``.mp3`` file at the same path
	    with the suffix replaced.
	    """
	    if not sys.argv[1:]:
	        print(
	            "Usage: GOOGLE_APPLICATION_CREDENTIALS=service_account.json {} book_name.epub".format(
	                sys.argv[0]
	            )
	        )
	        sys.exit(1)
	    narrator = Narrator()

	    # narrator.print_voice_names()

	    for book_file in sys.argv[1:]:
	        text = book_to_text(book_file)
	        mp3_path = Path(book_file).with_suffix(".mp3")
	        narrator.text_to_mp3(text, mp3_path)
	        print("Generated mp3", mp3_path)

	        # The original author runs a separate upload script (for overcast)
	        # on mp3_path at this point; that script is not part of this file.


	# Run the command-line entry point only when executed as a script.
	if __name__ == "__main__":
	    main()