Created
November 2, 2025 13:55
-
-
Save the-code-rider/467391ec41a27a4da1821814227e79a9 to your computer and use it in GitHub Desktop.
cartesia tts script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import requests | |
| from dotenv import load_dotenv | |
| import numpy as np | |
| import soundfile as sf | |
# Load environment variables from a .env file so that the os.getenv() lookups
# in get_headers() can see values defined there.
load_dotenv()
# Full endpoint URL that synthesize_speech() POSTs its TTS requests to.
CARTESIA_TTS_URL = "https://api.cartesia.ai/tts/bytes"
def get_headers() -> dict:
    """Return the HTTP headers for a Cartesia API request.

    Reads two environment variables:

    * ``CARTESIA_API_TOKEN`` -- required; sent as a bearer token.
    * ``CARTESIA_VERSION`` -- optional; defaults to ``"2024-01-01"``.

    Returns:
        A dict with the ``Cartesia-Version``, ``Authorization`` and
        ``Content-Type`` headers.

    Raises:
        RuntimeError: If ``CARTESIA_API_TOKEN`` is unset or empty.
    """
    token = os.getenv("CARTESIA_API_TOKEN")
    if not token:
        # Fail fast with a clear message: formatting a missing token would
        # otherwise produce the literal header value "Bearer None".
        raise RuntimeError(
            "CARTESIA_API_TOKEN is not set; export it or add it to your .env file."
        )
    return {
        "Cartesia-Version": os.getenv("CARTESIA_VERSION", "2024-01-01"),
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
def build_payload(
    model_id: str,
    transcript: str,
    voice_id: str,
    mode: str = "id",
    *,
    language: str = "en",
    sample_rate: int = 8000,
) -> dict:
    """Build the JSON payload for a TTS request.

    Args:
        model_id: Value sent as ``model_id``.
        transcript: Text to synthesize, sent as ``transcript``.
        voice_id: Value sent as ``voice.id``.
        mode: Value sent as ``voice.mode``.
        language: Value sent as ``language``.
        sample_rate: Value sent as ``output_format.sample_rate``. Whatever
            decodes the returned raw audio must use the same rate.

    Returns:
        A JSON-serializable dict requesting raw ``pcm_f32le`` output.
    """
    return {
        "model_id": model_id,
        "transcript": transcript,
        "voice": {
            "mode": mode,
            "id": voice_id,
        },
        "language": language,
        "generation_config": {
            "volume": 1,
            "speed": 1,
            "emotion": "neutral",
        },
        "output_format": {
            "container": "raw",
            "encoding": "pcm_f32le",
            "sample_rate": sample_rate,
        },
        "save": False,
        # NOTE(review): this top-level "speed" sits alongside
        # generation_config["speed"] in a different form -- confirm which one
        # the targeted API version honours. Kept to preserve the request shape.
        "speed": "normal",
    }
def synthesize_speech(
    model_id: str,
    transcript: str,
    voice_id: str,
    mode: str = "id",
    *,
    timeout: float = 60.0,
) -> bytes:
    """POST a TTS request to the Cartesia API and return the response body.

    Args:
        model_id: Forwarded to ``build_payload``.
        transcript: Forwarded to ``build_payload``.
        voice_id: Forwarded to ``build_payload``.
        mode: Forwarded to ``build_payload``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.

    Returns:
        The raw bytes of the HTTP response body (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = get_headers()
    payload = build_payload(model_id, transcript, voice_id, mode)
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole script.
    response = requests.post(
        CARTESIA_TTS_URL,
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()  # Surface 4xx/5xx answers as exceptions
    return response.content
def pcm_f32le_raw_to_wav(raw: bytes, path: str, sample_rate: int = 8000, channels: int = 1) -> None:
    """Write raw little-endian float32 PCM samples to ``path`` as a float WAV.

    Args:
        raw: Sample data, interpreted as little-endian float32 values; for
            ``channels > 1`` it is reshaped to ``(N, channels)``.
        path: Destination file path handed to ``soundfile.write``.
        sample_rate: Sample rate written to the file; must match the rate the
            audio was generated at.
        channels: Number of channels in ``raw``.

    Raises:
        ValueError: If ``channels`` is less than 1, or if ``len(raw)`` is not
            a whole number of frames (4 bytes per sample per channel).
    """
    if channels < 1:
        # Previously a non-positive channel count was silently treated as mono.
        raise ValueError(f"channels must be >= 1, got {channels}")
    frame_bytes = 4 * channels  # one float32 sample per channel
    if len(raw) % frame_bytes:
        raise ValueError(
            f"raw audio length {len(raw)} is not a multiple of the "
            f"{frame_bytes}-byte frame size for {channels} channel(s)"
        )
    # Interpret raw bytes as float32 little-endian and shape (N, channels)
    audio = np.frombuffer(raw, dtype="<f4")
    if channels > 1:
        audio = audio.reshape(-1, channels)
    sf.write(path, audio, samplerate=sample_rate, subtype="FLOAT")  # Writes IEEE float WAV
if __name__ == "__main__":
    # Demo run: synthesize one fixed sentence and store it as a WAV file.
    demo_model_id = "sonic-3"
    demo_voice_id = "cc00e582-ed66-4004-8336-0175b85c85f6"
    demo_transcript = "You need to wrap the raw samples into a WAV. Two ways:"
    wav_destination = "speech4.wav"

    # Fetch the raw audio bytes first, then wrap them into the WAV container.
    raw_audio = synthesize_speech(demo_model_id, demo_transcript, demo_voice_id)
    pcm_f32le_raw_to_wav(raw_audio, wav_destination)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment