Skip to content

Instantly share code, notes, and snippets.

@the-code-rider
Created November 2, 2025 13:55
Show Gist options
  • Select an option

  • Save the-code-rider/467391ec41a27a4da1821814227e79a9 to your computer and use it in GitHub Desktop.

Select an option

Save the-code-rider/467391ec41a27a4da1821814227e79a9 to your computer and use it in GitHub Desktop.
cartesia tts script
import os
import requests
from dotenv import load_dotenv
import numpy as np
import soundfile as sf
# Load environment variables from a .env file so that the os.getenv() lookups
# in get_headers() can see CARTESIA_API_TOKEN / CARTESIA_VERSION.
load_dotenv()
# Cartesia text-to-speech endpoint ("tts/bytes"); synthesize_speech() POSTs the
# JSON payload here and reads the response body as audio bytes.
CARTESIA_TTS_URL = "https://api.cartesia.ai/tts/bytes"
def get_headers():
    """Return the HTTP headers for a Cartesia API request.

    Two environment variables are read at call time:

    * ``CARTESIA_API_TOKEN`` - required; sent as a bearer token.
    * ``CARTESIA_VERSION`` - optional; defaults to ``"2024-01-01"``.

    Returns:
        A dict with the "Cartesia-Version", "Authorization" and
        "Content-Type" headers.

    Raises:
        RuntimeError: If ``CARTESIA_API_TOKEN`` is unset or empty.
    """
    token = os.getenv("CARTESIA_API_TOKEN")
    if not token:
        # Without this guard the header would literally read "Bearer None"
        # and the failure would only surface later as an HTTP auth error.
        raise RuntimeError(
            "CARTESIA_API_TOKEN is not set; export it or add it to your .env file"
        )
    return {
        "Cartesia-Version": os.getenv("CARTESIA_VERSION", "2024-01-01"),
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
def build_payload(
    model_id: str,
    transcript: str,
    voice_id: str,
    mode: str = "id",
    *,
    language: str = "en",
    sample_rate: int = 8000,
):
    """Build the JSON payload for a TTS request.

    Args:
        model_id: Value sent as ``model_id``.
        transcript: Text to synthesize; sent as ``transcript``.
        voice_id: Value sent as ``voice.id``.
        mode: Value sent as ``voice.mode``.
        language: Value sent as ``language``.
        sample_rate: Value sent as ``output_format.sample_rate``. Pass the
            same value to ``pcm_f32le_raw_to_wav`` when decoding the audio,
            otherwise the WAV header will not match the samples.

    Returns:
        A JSON-serializable dict requesting raw ``pcm_f32le`` output.
    """
    return {
        "model_id": model_id,
        "transcript": transcript,
        "voice": {
            "mode": mode,
            "id": voice_id,
        },
        "language": language,
        "generation_config": {
            "volume": 1,
            "speed": 1,
            "emotion": "neutral",
        },
        "output_format": {
            "container": "raw",
            "encoding": "pcm_f32le",
            "sample_rate": sample_rate,
        },
        "save": False,
        # NOTE(review): speed is given both here (string) and inside
        # generation_config (number); kept as-is - confirm against the
        # Cartesia API reference which of the two the chosen model honors.
        "speed": "normal",
    }
def synthesize_speech(
    model_id: str,
    transcript: str,
    voice_id: str,
    mode: str = "id",
    *,
    timeout: float = 60.0,
):
    """Call the Cartesia TTS endpoint and return the response body.

    Args:
        model_id: Forwarded to ``build_payload``.
        transcript: Text to synthesize; forwarded to ``build_payload``.
        voice_id: Forwarded to ``build_payload``.
        mode: Forwarded to ``build_payload``.
        timeout: Seconds passed to ``requests.post`` as ``timeout``, so a
            stalled connection raises instead of blocking indefinitely.

    Returns:
        The raw response bytes (``response.content``). The payload built by
        ``build_payload`` requests headerless ``pcm_f32le`` audio.

    Raises:
        requests.HTTPError: On a 4xx/5xx response (via ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = get_headers()
    payload = build_payload(model_id, transcript, voice_id, mode)
    response = requests.post(
        CARTESIA_TTS_URL,
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()  # Surface HTTP errors instead of returning an error body as audio
    return response.content
def pcm_f32le_raw_to_wav(raw: bytes, path: str, sample_rate: int = 8000, channels: int = 1) -> None:
    """Write headerless float32 little-endian PCM bytes to ``path`` as audio.

    Args:
        raw: Sample data, decoded as little-endian 32-bit floats.
        path: Destination file handed to ``soundfile.write``.
        sample_rate: Sample rate recorded in the output file.
        channels: Channel count. With more than one channel the flat sample
            stream is reshaped into rows of ``channels`` samples.
    """
    samples = np.frombuffer(raw, dtype="<f4")
    # Single-channel audio stays 1-D; otherwise reshape to (N, channels).
    frames = samples.reshape(-1, channels) if channels > 1 else samples
    # subtype="FLOAT" stores the samples as IEEE float rather than integer PCM.
    sf.write(path, frames, samplerate=sample_rate, subtype="FLOAT")
if __name__ == "__main__":
    # Demo run: synthesize one sentence and save it as a float WAV file.
    demo_model = "sonic-3"
    demo_voice = "cc00e582-ed66-4004-8336-0175b85c85f6"
    demo_text = "You need to wrap the raw samples into a WAV. Two ways:"
    wav_path = "speech4.wav"

    # The API returns headerless PCM, so it is wrapped into a WAV on disk.
    raw_audio = synthesize_speech(demo_model, demo_text, demo_voice)
    pcm_f32le_raw_to_wav(raw_audio, wav_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment