Skip to content

Instantly share code, notes, and snippets.

@eustlb
Last active February 25, 2025 15:07
Show Gist options
  • Select an option
  • Save eustlb/5a9092a1635c967dfb908cc2ac757780 to your computer and use it in GitHub Desktop.
Benchmark seq vs batched - Style TTS2
# Submodule pinning the upstream Kokoro codebase used by the "original" benchmarks.
[submodule "original-codebase"]
	path = original-codebase
	# NOTE(review): the scraped page showed "github.com", a link-rewriting
	# artifact; the canonical repository is on github.com.
	url = https://github.com/hexgrad/kokoro.git
# Benchmark: original Kokoro KModel, single (non-batched) forward pass.
# Loads pre-dumped token ids / style vector, warms up, then times n_measures
# forwards with CUDA events and reports the mean latency in milliseconds.
import os
import sys

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))

from kokoro.model import KModel  # noqa: E402 — needs the sys.path insert above

import torch
import soundfile as sf

device = "cuda:2"
n_warmup = 3      # untimed runs (kernel compilation, cudnn autotune, caches)
n_measures = 100  # timed runs averaged for the report

model = KModel().to(device).eval()

# Pre-dumped inputs so the original and transformers ports see identical data.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# no_grad: the original benchmarked with autograd enabled, which records a
# throwaway graph every forward and skews both latency and memory.
with torch.no_grad():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# checking the output
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Drain warmup / copy kernels before the first timed iteration.
torch.cuda.synchronize()

total_time = 0
with torch.no_grad():
    for run in range(n_measures):
        start_event.record()
        _ = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event)  # milliseconds
        total_time += run_time

print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
# Benchmark: transformers StyleTextToSpeech2 port, batched generation.
# Generates a full batch at once and reports mean wall time per batch.
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

device = "cuda:1"
n_warmup = 3
n_measures = 3  # batched runs are long, so fewer timed repetitions
batch_size = 128

# Alternative inputs of increasing length, kept for reference:
# text = [
# "Yes.",
# "Not sure.",
# "Sounds good, let's go!",
# "I'm not entirely sure yet, but that makes sense.",
# "That could work, but I'd need to think about it a little more.",
# "I see what you mean, but I'd like to consider a few other possibilities first.",
# "That sounds like a reasonable idea, but I'd love to explore some alternatives before deciding.",
# "I appreciate the suggestion, and while I think it has potential, I'd need more details to be sure.",
# "I see where you're coming from, and it definitely has some merit, but I'd like to weigh a few other options first.",
# "That's an interesting perspective, and I can understand why you'd say that, though I think there are a few nuances we should consider.",
# "I can definitely see the logic behind that, and I think it's a compelling idea, but I'd want to do a bit more research before fully committing to it.",
# "That's a valid point, and I think there's a lot of truth to what you're saying, though I'd need to analyze the potential risks before moving forward.",
# "I appreciate that insight, and while I agree with some aspects of it, I think there are some deeper considerations that might affect the overall outcome.",
# "I understand your perspective, and I think it's a solid argument, but I'd want to take some time to examine the broader implications before making a final decision.",
# "That's an intriguing suggestion, and I can definitely see how it could be beneficial in some situations, but I'd want to explore additional data and perspectives to get a clearer picture.",
# "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals.",
# ]
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")
batch_size = inputs.input_ids.shape[0]

print("Warming up...")
for _ in range(n_warmup):
    out = model.generate(**inputs)
print("Done warming up")

# Write the audio once, after warmup. The original rewrote every file on each
# warmup iteration, paying batch_size GPU->CPU copies + disk writes per pass.
for i in range(batch_size):
    sf.write(f"{i}_batch.wav", out.waveform[i].cpu().numpy(), 24000)

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time = 0
for run in range(n_measures):
    start_event.record()
    out = model.generate(**inputs)
    end_event.record()
    torch.cuda.synchronize()
    run_time = start_event.elapsed_time(end_event) / 1000  # ms -> seconds
    total_time += run_time
    print(f"Run {run + 1} took {run_time:.2f} seconds")

print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
# Benchmark: transformers StyleTextToSpeech2 port, single (non-batched)
# generate on the same pre-dumped inputs as the original-KModel benchmark.
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

device = "cuda:2"
n_warmup = 3
n_measures = 100

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
# processor is unused here (inputs come from .pt dumps) but kept to mirror the
# sibling scripts — TODO confirm it can be dropped.
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()  # the other transformers scripts call eval(); without it
              # training-mode layers (e.g. dropout) could run during timing
model.to(device)

input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
for _ in range(n_warmup):
    out = model.generate(input_ids, ref_s)
print("Done warming up")

# checking the output
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time = 0
for run in range(n_measures):
    start_event.record()
    _ = model.generate(input_ids, ref_s)
    end_event.record()
    torch.cuda.synchronize()
    run_time = start_event.elapsed_time(end_event)  # milliseconds
    total_time += run_time

print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
# Benchmark: transformers StyleTextToSpeech2 port, sequential generation —
# one sample at a time over the same batch, to compare against the batched run.
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

device = "cuda:1"
n_warmup = 3
n_measures = 3
batch_size = 128

# Alternative inputs of increasing length, kept for reference:
# text = [
# "Yes.",
# "Not sure.",
# "Sounds good, let's go!",
# "I'm not entirely sure yet, but that makes sense.",
# "That could work, but I'd need to think about it a little more.",
# "I see what you mean, but I'd like to consider a few other possibilities first.",
# "That sounds like a reasonable idea, but I'd love to explore some alternatives before deciding.",
# "I appreciate the suggestion, and while I think it has potential, I'd need more details to be sure.",
# "I see where you're coming from, and it definitely has some merit, but I'd like to weigh a few other options first.",
# "That's an interesting perspective, and I can understand why you'd say that, though I think there are a few nuances we should consider.",
# "I can definitely see the logic behind that, and I think it's a compelling idea, but I'd want to do a bit more research before fully committing to it.",
# "That's a valid point, and I think there's a lot of truth to what you're saying, though I'd need to analyze the potential risks before moving forward.",
# "I appreciate that insight, and while I agree with some aspects of it, I think there are some deeper considerations that might affect the overall outcome.",
# "I understand your perspective, and I think it's a solid argument, but I'd want to take some time to examine the broader implications before making a final decision.",
# "That's an intriguing suggestion, and I can definitely see how it could be beneficial in some situations, but I'd want to explore additional data and perspectives to get a clearer picture.",
# "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals.",
# ]
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")

print("Warming up...")
for _ in range(n_warmup):
    for i, (input_ids, style, attention_mask) in enumerate(zip(inputs.input_ids, inputs.style, inputs.attention_mask)):
        # Strip padding: sequential generation only needs the real tokens.
        length = attention_mask.sum()
        input_ids = input_ids[:length]
        out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
        sf.write(f"{i}_sequential.wav", out.waveform.squeeze().cpu().numpy(), 24000)
        # (the original's manual `i += 1` was dead code: enumerate rebinds i
        # on every iteration, so the increment was always overwritten)
print("Done warming up")

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time = 0
for run in range(n_measures):
    start_event.record()
    for input_ids, style, attention_mask in zip(inputs.input_ids, inputs.style, inputs.attention_mask):
        length = attention_mask.sum()
        input_ids = input_ids[:length]
        out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
    end_event.record()
    torch.cuda.synchronize()
    run_time = start_event.elapsed_time(end_event) / 1000  # ms -> seconds
    total_time += run_time
    print(f"Run {run + 1} took {run_time:.2f} seconds")

print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
# Benchmark + profile: original Kokoro KModel, single forward pass.
# Same timing loop as the plain benchmark, then one profiled forward whose
# trace is written for TensorBoard.
import os
import sys

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))

from kokoro.model import KModel  # noqa: E402 — needs the sys.path insert above

import torch
import soundfile as sf

device = "cuda:2"
n_warmup = 3
n_measures = 100

model = KModel().to(device).eval()

input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# no_grad: autograd bookkeeping would otherwise pollute both the timings and
# the profiler trace of this inference-only workload.
with torch.no_grad():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# checking the output
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time = 0
with torch.no_grad():
    for run in range(n_measures):
        start_event.record()
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event)  # milliseconds
        total_time += run_time

print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# Profile a single forward; view the trace with tensorboard.
with torch.no_grad(), torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-original"))
) as prof:
    out = model.forward_with_tokens(input_ids, ref_s, 1.0)
# Benchmark + profile: transformers StyleTextToSpeech2 port, single generate.
# Same timing loop as the plain benchmark, then one profiled generate whose
# trace is written for TensorBoard.
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

device = "cuda:2"
n_warmup = 3
n_measures = 100

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
# processor is unused here (inputs come from .pt dumps) but kept to mirror the
# sibling scripts — TODO confirm it can be dropped.
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()  # consistent with the batched scripts; avoids training-mode layers
model.to(device)

input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
for _ in range(n_warmup):
    out = model.generate(input_ids, ref_s)
print("Done warming up")

# checking the output
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time = 0
for run in range(n_measures):
    start_event.record()
    out = model.generate(input_ids, ref_s)
    end_event.record()
    torch.cuda.synchronize()
    run_time = start_event.elapsed_time(end_event)  # milliseconds
    total_time += run_time

print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# Profile a single generate; view the trace with tensorboard.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-trfms"))
) as prof:
    out = model.generate(input_ids, ref_s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment