pathikrit · April 22, 2025 18:53
diff --git a/aws_tts.py b/aws_tts.py
 import boto3
 import json
 import base64

 def speak(text: str, voice: str = "Joanna"):
     """Synthesize ``text`` with Amazon Polly and return audio plus word marks.

     Polly is called twice with the same text and voice: once with
     ``OutputFormat='json'`` to fetch word speech marks, and once with
     ``OutputFormat='mp3'`` to fetch the audio itself.

     :param text: text sent to Polly as ``Text``.
     :param voice: Polly ``VoiceId``.
     :return: dict with ``speech_marks`` (one decoded JSON object per
         non-empty line of the speech-mark stream) and ``b64_audio_uri``
         (the MP3 bytes as a base64 ``data:audio/mpeg`` URI).
     """
     # Local import: the module's import block does not provide ``closing``.
     from contextlib import closing

     polly_client = boto3.client('polly')

     def synthesize_speech(speech_marks: bool):
         """Run one Polly request; return parsed marks or an audio data URI."""
         response = polly_client.synthesize_speech(
             VoiceId=voice,
             OutputFormat='json' if speech_marks else 'mp3',
             Text=text,
             Engine='generative',
             SpeechMarkTypes=['word'] if speech_marks else [],
         )
         # Close the streaming body once consumed instead of leaving it to
         # the garbage collector.
         with closing(response['AudioStream']) as stream:
             if speech_marks:
                 # The speech-mark stream is newline-delimited JSON.
                 return [
                     json.loads(line.decode('utf-8'))
                     for line in stream.iter_lines()
                     if line
                 ]
             audio_bytes = stream.read()
         return f"data:audio/mpeg;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"

     return {
         "speech_marks": synthesize_speech(speech_marks=True),
         "b64_audio_uri": synthesize_speech(speech_marks=False)
     }
diff --git a/google_tts.py b/google_tts.py
 from google.cloud import texttospeech_v1beta1 as tts
 from google.api_core.client_options import ClientOptions

 def speak(
     text: str,
     chirp3_voice: str = "Zephyr",
     speed: float = 0.8,
     lang: str = "en-US"
 ):
     """Synthesize ``text`` with Google Cloud Text-to-Speech (Chirp3 HD voice).

     The client is built with the API key read from the ``GEMINI_API_KEY``
     environment variable, and the audio is requested as MP3.

     :param text: plain text passed as the synthesis input.
     :param chirp3_voice: https://cloud.google.com/text-to-speech/docs/chirp3-hd
     :param speed: value passed as ``speaking_rate`` in the audio config.
     :param lang: language code; also the prefix of the voice name
         ``<lang>-Chirp3-HD-<chirp3_voice>``.
     :return: dict with ``speech_marks`` (one ``mark_name``/``time_seconds``
         dict per entry in ``response.timepoints``) and ``b64_audio_uri``
         (the MP3 bytes as a base64 ``data:audio/mpeg`` URI).
     """
     # Local imports: this module's import block does not bring in ``os`` or
     # ``base64``, so the bare references would raise NameError.
     import base64
     import os

     client = tts.TextToSpeechClient(
         client_options=ClientOptions(api_key=os.getenv("GEMINI_API_KEY"))
     )
     voice_name = "-".join([lang, "Chirp3", "HD", chirp3_voice])
     # NOTE(review): SSML_MARK timepoints presumably refer to <mark> tags in
     # SSML input; this request sends plain text, so ``timepoints`` may always
     # be empty -- confirm against the API documentation.
     request = tts.SynthesizeSpeechRequest(
         input=tts.SynthesisInput(text=text),
         voice=tts.VoiceSelectionParams(language_code=lang, name=voice_name),
         audio_config=tts.AudioConfig(audio_encoding=tts.AudioEncoding.MP3, speaking_rate=speed),
         enable_time_pointing=[tts.SynthesizeSpeechRequest.TimepointType.SSML_MARK],
     )
     response = client.synthesize_speech(request)

     return {
         "speech_marks": [{"mark_name": tp.mark_name, "time_seconds": tp.time_seconds} for tp in response.timepoints],
         "b64_audio_uri": f"data:audio/mpeg;base64,{base64.b64encode(response.audio_content).decode('utf-8')}"
     }
	import boto3
	import json
	import base64

	def speak(text: str, voice: str = "Joanna"):
	    """Synthesize ``text`` with Amazon Polly and return audio plus word marks.

	    Polly is called twice with the same text and voice: once with
	    ``OutputFormat='json'`` to fetch word speech marks, and once with
	    ``OutputFormat='mp3'`` to fetch the audio itself.

	    :param text: text sent to Polly as ``Text``.
	    :param voice: Polly ``VoiceId``.
	    :return: dict with ``speech_marks`` (one decoded JSON object per
	        non-empty line of the speech-mark stream) and ``b64_audio_uri``
	        (the MP3 bytes as a base64 ``data:audio/mpeg`` URI).
	    """
	    # Local import: the module's import block does not provide ``closing``.
	    from contextlib import closing

	    polly_client = boto3.client('polly')

	    def synthesize_speech(speech_marks: bool):
	        """Run one Polly request; return parsed marks or an audio data URI."""
	        response = polly_client.synthesize_speech(
	            VoiceId=voice,
	            OutputFormat='json' if speech_marks else 'mp3',
	            Text=text,
	            Engine='generative',
	            SpeechMarkTypes=['word'] if speech_marks else [],
	        )
	        # Close the streaming body once consumed instead of leaving it to
	        # the garbage collector.
	        with closing(response['AudioStream']) as stream:
	            if speech_marks:
	                # The speech-mark stream is newline-delimited JSON.
	                return [
	                    json.loads(line.decode('utf-8'))
	                    for line in stream.iter_lines()
	                    if line
	                ]
	            audio_bytes = stream.read()
	        return f"data:audio/mpeg;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"

	    return {
	        "speech_marks": synthesize_speech(speech_marks=True),
	        "b64_audio_uri": synthesize_speech(speech_marks=False)
	    }
	from google.cloud import texttospeech_v1beta1 as tts
	from google.api_core.client_options import ClientOptions

	def speak(
	    text: str,
	    chirp3_voice: str = "Zephyr",
	    speed: float = 0.8,
	    lang: str = "en-US"
	):
	    """Synthesize ``text`` with Google Cloud Text-to-Speech (Chirp3 HD voice).

	    The client is built with the API key read from the ``GEMINI_API_KEY``
	    environment variable, and the audio is requested as MP3.

	    :param text: plain text passed as the synthesis input.
	    :param chirp3_voice: https://cloud.google.com/text-to-speech/docs/chirp3-hd
	    :param speed: value passed as ``speaking_rate`` in the audio config.
	    :param lang: language code; also the prefix of the voice name
	        ``<lang>-Chirp3-HD-<chirp3_voice>``.
	    :return: dict with ``speech_marks`` (one ``mark_name``/``time_seconds``
	        dict per entry in ``response.timepoints``) and ``b64_audio_uri``
	        (the MP3 bytes as a base64 ``data:audio/mpeg`` URI).
	    """
	    # Local imports: this module's import block does not bring in ``os`` or
	    # ``base64``, so the bare references would raise NameError.
	    import base64
	    import os

	    client = tts.TextToSpeechClient(
	        client_options=ClientOptions(api_key=os.getenv("GEMINI_API_KEY"))
	    )
	    voice_name = "-".join([lang, "Chirp3", "HD", chirp3_voice])
	    # NOTE(review): SSML_MARK timepoints presumably refer to <mark> tags in
	    # SSML input; this request sends plain text, so ``timepoints`` may always
	    # be empty -- confirm against the API documentation.
	    request = tts.SynthesizeSpeechRequest(
	        input=tts.SynthesisInput(text=text),
	        voice=tts.VoiceSelectionParams(language_code=lang, name=voice_name),
	        audio_config=tts.AudioConfig(audio_encoding=tts.AudioEncoding.MP3, speaking_rate=speed),
	        enable_time_pointing=[tts.SynthesizeSpeechRequest.TimepointType.SSML_MARK],
	    )
	    response = client.synthesize_speech(request)

	    return {
	        "speech_marks": [{"mark_name": tp.mark_name, "time_seconds": tp.time_seconds} for tp in response.timepoints],
	        "b64_audio_uri": f"data:audio/mpeg;base64,{base64.b64encode(response.audio_content).decode('utf-8')}"
	    }