Last active
February 25, 2025 15:07
-
-
Save eustlb/5a9092a1635c967dfb908cc2ac757780 to your computer and use it in GitHub Desktop.
Benchmark seq vs batched - Style TTS2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| traces-*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [submodule "original-codebase"] | |
| path = original-codebase | |
| url = https://github.com/hexgrad/kokoro.git |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

# Make the vendored original Kokoro codebase importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))
from kokoro.model import KModel

import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = KModel().to(device).eval()

# Pre-tokenized input and style embedding saved next to this script.
# NOTE(review): shapes assumed compatible with forward_with_tokens — produced
# by a companion script not shown here.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: without it autograd records the graph for every forward,
# inflating both memory and the timings we are about to measure.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        _ = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

# Benchmark configuration: time batched generation over a few runs.
device = "cuda:1"
n_warmup = 3
n_measures = 3
batch_size = 128

# One long sentence replicated to fill the batch, so every row has the same
# (maximal) length and padding overhead is negligible.
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")
batch_size = inputs.input_ids.shape[0]

print("Warming up...")
# inference_mode avoids autograd bookkeeping during warmup and timing.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(**inputs)
# Write the sanity-check files once, after warmup, instead of rewriting all
# batch_size files on every warmup iteration.
for i in range(batch_size):
    sf.write(f"{i}_batch.wav", out.waveform[i].cpu().numpy(), 24000)
print("Done warming up")

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.generate(**inputs)
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
        total_time += run_time
        print(f"Run {run + 1} took {run_time:.2f} seconds")
print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
# eval() was missing here while every sibling benchmark calls it — without
# it dropout/normalization layers can run in train mode and skew results.
model.eval()
model.to(device)

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: skip autograd bookkeeping during warmup and timing.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(input_ids, ref_s)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        _ = model.generate(input_ids, ref_s)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import soundfile as sf
import torch

# Benchmark configuration: time one-sample-at-a-time generation over the
# same batch used by the batched benchmark, for a fair comparison.
device = "cuda:1"
n_warmup = 3
n_measures = 3
batch_size = 128

# One long sentence replicated batch_size times (matches the batched run).
text = [
    "I really appreciate the thought you put into that idea, and I think there's a lot of potential in what you're proposing, but before I can give a definitive answer, I'd need to consider all possible consequences and weigh them against our long-term goals."
] * batch_size

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
model.eval()
model.to(device)

inputs = processor(text, return_tensors="pt", padding=True).to(device)
print(f"Input ids shape: {inputs.input_ids.shape}")

print("Warming up...")
with torch.inference_mode():
    for w in range(n_warmup):
        # enumerate already yields the index — the original also did a
        # redundant manual `i += 1`, removed here.
        for i, (input_ids, style, attention_mask) in enumerate(
            zip(inputs.input_ids, inputs.style, inputs.attention_mask)
        ):
            # Strip padding so each sequential call sees its true length.
            length = attention_mask.sum()
            input_ids = input_ids[:length]
            out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
            # Write sanity-check files only once (last warmup pass), not on
            # every warmup iteration.
            if w == n_warmup - 1:
                sf.write(f"{i}_sequential.wav", out.waveform.squeeze().cpu().numpy(), 24000)
print("Done warming up")

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        for input_ids, style, attention_mask in zip(inputs.input_ids, inputs.style, inputs.attention_mask):
            length = attention_mask.sum()
            input_ids = input_ids[:length]
            out = model.generate(input_ids.unsqueeze(0), style.unsqueeze(0))
        end_event.record()
        torch.cuda.synchronize()
        run_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
        total_time += run_time
        print(f"Run {run + 1} took {run_time:.2f} seconds")
print(f"BS {batch_size}, mean time per run: {(total_time/n_measures):.2f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

# Make the vendored original Kokoro codebase importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(current_dir, "original-codebase"))
from kokoro.model import KModel

import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = KModel().to(device).eval()

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: autograd would otherwise record graphs during warmup,
# timing, and the profiled run, polluting both timings and traces.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_original.wav", out[0].squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.forward_with_tokens(input_ids, ref_s, 1.0)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# One profiled forward pass; trace is written for TensorBoard inspection.
with torch.inference_mode(), torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-original"))
) as prof:
    out = model.forward_with_tokens(input_ids, ref_s, 1.0)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

current_dir = os.path.dirname(os.path.abspath(__file__))

from transformers import StyleTextToSpeech2Model, StyleTextToSpeech2Processor
import torch
import soundfile as sf

# Benchmark configuration.
device = "cuda:2"
n_warmup = 3      # untimed runs to let CUDA kernels/allocator settle
n_measures = 100  # timed runs averaged for the report

model = StyleTextToSpeech2Model.from_pretrained("eustlb/kokoro")
processor = StyleTextToSpeech2Processor.from_pretrained("eustlb/kokoro")
# eval() was missing here while the original-codebase benchmark calls it —
# without it dropout/normalization layers can run in train mode.
model.eval()
model.to(device)

# Pre-tokenized input and style embedding saved next to this script.
input_ids = torch.load(os.path.join(current_dir, "input_ids.pt")).to(device)
ref_s = torch.load(os.path.join(current_dir, "style.pt")).to(device)

print("Warming up...")
# inference_mode: skip autograd bookkeeping during warmup/timing/profiling.
with torch.inference_mode():
    for _ in range(n_warmup):
        out = model.generate(input_ids, ref_s)
print("Done warming up")

# Sanity-check the audio produced by the last warmup run.
sf.write("out_trfms.wav", out.waveform.squeeze().cpu().numpy(), 24000)

# CUDA events time GPU work; synchronize before reading elapsed_time.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
total_time = 0
with torch.inference_mode():
    for run in range(n_measures):
        start_event.record()
        out = model.generate(input_ids, ref_s)
        end_event.record()
        torch.cuda.synchronize()
        total_time += start_event.elapsed_time(end_event)  # milliseconds
print(f"Mean time per run: {(total_time/n_measures):.2f} ms")

# One profiled generate call; trace is written for TensorBoard inspection.
with torch.inference_mode(), torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(current_dir, "traces-trfms"))
) as prof:
    out = model.generate(input_ids, ref_s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment