Last active
February 25, 2025 15:07
-
-
Save eustlb/5a9092a1635c967dfb908cc2ac757780 to your computer and use it in GitHub Desktop.
Benchmark seq vs batched - Style TTS2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| traces-*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [submodule "original-codebase"] | |
| path = original-codebase | |
| url = https://github.com/hexgrad/kokoro.git |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

# Make the vendored original Kokoro codebase importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))
from kokoro.model import KModel

import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = KModel().to(device).eval()

# Pre-tokenized input and style embedding saved next to this script.
# NOTE(review): shapes assumed compatible with forward_with_tokens — produced
# by a companion script not shown here.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: without it autograd records the graph for every forward,
# inflating both memory and the timings we are about to measure.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        _ = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

# Benchmark configuration: time batched generation over a few runs.
device = "cuda:1"
n_warmup = 3
n_measures = 3
batch_size = 128

# One long sentence replicated to fill the batch, so every row has the same
# (maximal) length and padding overhead is negligible.
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")
batch_size = inputs.input_ids.shape[0]

print("Warming up...")
# inference_mode avoids autograd bookkeeping during warmup and timing.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(**inputs)
# Write the sanity-check files once, after warmup, instead of rewriting all
# batch_size files on every warmup iteration.
for i in range(batch_size):
    sf.write(f"{i}_batch.wav", out.waveform[i].cpu().numpy(), 24000)
print("Done warming up")

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.generate(**inputs)
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
        total_time += run_time
        print(f"Run {run + 1} took {run_time:.2f} seconds")
print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
# eval() was missing here while every sibling benchmark calls it — without
# it dropout/normalization layers can run in train mode and skew results.
model.eval()
model.to(device)

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: skip autograd bookkeeping during warmup and timing.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(input_ids, ref_s)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        _ = model.generate(input_ids, ref_s)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

# Benchmark configuration: time one-sample-at-a-time generation over the
# same batch used by the batched benchmark, for a fair comparison.
device = "cuda:1"
n_warmup = 3
n_measures = 3
batch_size = 128

# One long sentence replicated batch_size times (matches the batched run).
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")

print("Warming up...")
with torch.inference_mode():
    for w in range(n_warmup):
        # enumerate already yields the index — the original also did a
        # redundant manual `i += 1`, removed here.
        for i, (input_ids, style, attention_mask) in enumerate(
            zip(inputs.input_ids, inputs.style, inputs.attention_mask)
        ):
            # Strip padding so each sequential call sees its true length.
            length = attention_mask.sum()
            input_ids = input_ids[:length]
            out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
            # Write sanity-check files only once (last warmup pass), not on
            # every warmup iteration.
            if w == n_warmup - 1:
                sf.write(f"{i}_sequential.wav", out.waveform.squeeze().cpu().numpy(), 24000)
print("Done warming up")

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        for input_ids, style, attention_mask in zip(inputs.input_ids, inputs.style, inputs.attention_mask):
            length = attention_mask.sum()
            input_ids = input_ids[:length]
            out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
        total_time += run_time
        print(f"Run {run + 1} took {run_time:.2f} seconds")
print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

# Make the vendored original Kokoro codebase importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))
from kokoro.model import KModel

import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = KModel().to(device).eval()

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: autograd would otherwise record graphs during warmup,
# timing, and the profiled run, polluting both timings and traces.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# One profiled forward pass; trace is written for TensorBoard inspection.
with torch.inference_mode(), torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-original"))
) as prof:
    out = model.forward_with_tokens(input_ids, ref_s, 1.0)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
# eval() was missing here while the original-codebase benchmark calls it —
# without it dropout/normalization layers can run in train mode.
model.eval()
model.to(device)

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: skip autograd bookkeeping during warmup/timing/profiling.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(input_ids, ref_s)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.generate(input_ids, ref_s)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# One profiled generate call; trace is written for TensorBoard inspection.
with torch.inference_mode(), torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-trfms"))
) as prof:
    out = model.generate(input_ids, ref_s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment