Convert audio files to text using faster-whisper, a faster reimplementation of OpenAI's Whisper
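At its core, the script wraps a single faster-whisper call, WhisperModel.transcribe, which returns a lazy generator of segments plus language-detection info. A minimal sketch of that API in isolation (the model size, device, and "example.mp3" file name here are placeholders, not part of the script below):

from faster_whisper import WhisperModel

model = WhisperModel("large-v2", device="cuda", compute_type="float32")
segments, info = model.transcribe("example.mp3", beam_size=5, vad_filter=True)
print("Detected language:", info.language)
for segment in segments:
    # Each segment carries float start/end times (seconds) and the recognized text
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

The full script: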
# https://github.com/guillaumekln/faster-whisper
# pip install faster-whisper
# pip install nvidia-cublas-cu11 nvidia-cudnn-cu11
# pip install torch
# Export your library path prior to running (WSL or a virtual env):
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
import os
import shutil

import torch

# To print the library path, run:
# python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'
os.environ['LD_LIBRARY_PATH'] = "/usr/local/lib/python3.8/dist-packages/nvidia/cublas/lib:/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib"

from faster_whisper import WhisperModel
model_size = "large-v2"

# List of valid file extensions (the match below is case-sensitive)
valid_extensions = (".WAV", ".mp3")

# Run on GPU with FP32
model = WhisperModel(model_size, device="cuda", compute_type="float32")
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Replace these with the actual paths to your audio and output directories
audio_directory = '/mnt/d/Recordings'
destination_directory = '/mnt/d/Recordings/converted'
os.makedirs(destination_directory, exist_ok=True)
for audio_file in os.listdir(audio_directory):
    if audio_file.endswith(valid_extensions):
        source_file = os.path.join(audio_directory, audio_file)
        txt_file = audio_file + ".txt"
        output_file = os.path.join(destination_directory, txt_file)

        # transcribe() returns a lazy generator of segments plus metadata
        segments, info = model.transcribe(source_file, beam_size=5, vad_filter=True)
        print(f"Transcribing {audio_file}...")
        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

        # Open the transcript file in write mode
        with open(output_file, 'w') as file:
            for segment in segments:
                # Convert the float start/end times (in seconds) to HH:MM:SS
                start_seconds = segment.start
                end_seconds = segment.end
                start_hours = int(start_seconds // 3600)
                start_minutes = int((start_seconds % 3600) // 60)
                start_seconds = start_seconds % 60
                end_hours = int(end_seconds // 3600)
                end_minutes = int((end_seconds % 3600) // 60)
                end_seconds = end_seconds % 60
                output = "[%02d:%02d:%02d -> %02d:%02d:%02d] %s" % (start_hours, start_minutes, start_seconds, end_hours, end_minutes, end_seconds, segment.text)
                print(output)
                file.write(output + '\n')
        print(f"Output saved to {output_file}")

        # Move the processed audio file into the converted directory
        shutil.move(source_file, destination_directory)

        # Release per-file resources before the next iteration
        del segments
        del info
        torch.cuda.empty_cache()

del model
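The HH:MM:SS arithmetic in the loop can also be expressed with divmod; an equivalent minimal sketch (the format_timestamp name is just illustrative, not part of the script above):

def format_timestamp(seconds: float) -> str:
    # Split whole seconds into hours, minutes, and seconds
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return "%02d:%02d:%02d" % (hours, minutes, secs)

# Example: 3661.5 seconds -> "01:01:01"
print(format_timestamp(3661.5))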