Speech to Text with Whisper from OpenAI with Python
import torch
import whisper
from pathlib import Path
from termcolor import colored


class SpeechToTextConverter:
    def __init__(self, folders=["."], exts=[".mp3"], files=None):
        self.folders = folders
        self.files = files
        self.exts = exts

    def get_filepaths(self):
        # Accept a single string as well as a list for folders and exts
        if type(self.folders) is str:
            self.folders = [self.folders]
        if type(self.exts) is str:
            self.exts = [self.exts]
        # Recursively collect matching files from every folder
        self.filepaths = []
        for folder in self.folders:
            for ext in self.exts:
                self.filepaths.extend(Path(folder).rglob(f"*{ext}"))
        print(
            colored(
                f"Following {len(self.filepaths)} files will be converted:",
                "light_magenta",
            )
        )
        for filepath in self.filepaths:
            print(f" > {filepath}")

    def seconds_to_timestamp_str(self, s):
        # Convert a float number of seconds to "MM:SS.mmm"
        minutes = int(s / 60)
        seconds = int(s) % 60
        milliseconds = int((s - int(s)) * 1000)
        return f"{minutes:02}:{seconds:02}.{milliseconds:03}"

    def format_segments(self, segments):
        # Render each Whisper segment as "[start --> end] text"
        lines = []
        for segment in segments:
            text = segment["text"]
            start = segment["start"]
            end = segment["end"]
            start_timestamp_str = self.seconds_to_timestamp_str(start)
            end_timestamp_str = self.seconds_to_timestamp_str(end)
            line = f"[{start_timestamp_str} --> {end_timestamp_str}] {text}"
            lines.append(line)
        return "\n".join(lines)

    def convert(
        self,
        filepath,
        model_name="small",
        language="en",
        output_ext=".txt",
        output_filepath=None,
    ):
        # Note: the model is (re)loaded on every call to convert()
        self.model = whisper.load_model(name=model_name)
        if not output_filepath:
            # Write the transcript next to the audio file, with the output extension
            output_filepath = Path(filepath).with_suffix(output_ext)
        filepath = str(filepath)
        print(colored(f"Converting: [{filepath}]", "light_cyan"))
        result = self.model.transcribe(filepath, verbose=True, language=language)
        text = self.format_segments(result["segments"])
        with open(output_filepath, "w") as wf:
            wf.write(text)
        print(colored(f"Dumped: [{output_filepath}]", "light_green"))

    def run(self):
        self.get_filepaths()
        for filepath in self.filepaths:
            self.convert(filepath)


def check_cuda():
    cuda_available = torch.cuda.is_available()
    cuda_version = torch.version.cuda
    print(colored(f"CUDA {cuda_version} Enabled: {cuda_available}", "light_green"))


if __name__ == "__main__":
    check_cuda()
    converter = SpeechToTextConverter(folders=["4th_1", "4th_2"], exts=[".mp3"])
    converter.run()
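
As a usage sketch beyond the __main__ block above (the folder name, extension, model size, and language here are placeholders, not part of the gist), the same class can be driven file by file to pick a different Whisper model and language per call:

# Usage sketch with placeholder paths and settings
converter = SpeechToTextConverter(folders=["lectures"], exts=[".m4a"])
converter.get_filepaths()
for filepath in converter.filepaths:
    converter.convert(filepath, model_name="medium", language="zh")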