-
-
Save juanmc2005/ed6413e697e176cb36a149d8c40a3a5b to your computer and use it in GitHub Desktop.
import logging | |
import os | |
import sys | |
import traceback | |
from contextlib import contextmanager | |
import diart.operators as dops | |
import numpy as np | |
import rich | |
import rx.operators as ops | |
import whisper_timestamped as whisper | |
from diart import SpeakerDiarization, SpeakerDiarizationConfig | |
from diart.sources import MicrophoneAudioSource | |
from pyannote.core import Annotation, SlidingWindowFeature, SlidingWindow, Segment | |
def concat(chunks, collar=0.05):
    """
    Merge a batch of `(diarization, waveform)` pairs into a single pair.

    Every diarization is gathered into one annotation, in which
    contiguous single-speaker regions separated by a pause shorter
    than `collar` seconds are fused. The audio of all chunks is joined
    along the first axis, and the resulting feature reuses the sliding
    window (duration, step, start) of the first waveform.
    """
    head_annotation, head_waveform = chunks[0][0], chunks[0][1]

    # Gather every prediction under the URI of the first one
    merged = Annotation(uri=head_annotation.uri)
    samples = []
    for prediction, audio in chunks:
        merged.update(prediction)
        samples.append(audio.data)
    merged = merged.support(collar)

    # The merged audio starts where the first chunk starts
    reference = head_waveform.sliding_window
    window = SlidingWindow(reference.duration, reference.step, reference.start)

    return merged, SlidingWindowFeature(np.concatenate(samples, axis=0), window)
def colorize_transcription(transcription):
    """
    Unify a speaker-aware transcription represented as
    a list of `(speaker: int, text: str)` pairs
    into a single text colored by speakers.

    Each caption becomes one line of the result. A caption whose speaker
    is `-1` (no speaker found) is emitted unchanged; any other caption is
    prefixed with a `[color]` tag chosen from a fixed palette. Speaker ids
    beyond the palette size wrap around and reuse colors instead of
    failing.

    Returns the lines joined by newline characters (an empty string when
    `transcription` is empty).
    """
    palette = [
        "bright_red", "bright_blue", "bright_green", "orange3", "deep_pink1",
        "yellow2", "magenta", "cyan", "bright_magenta", "dodger_blue2",
    ]
    lines = []
    for speaker, text in transcription:
        if speaker == -1:
            # No speaker found for this text, use default terminal color
            lines.append(text)
        else:
            # Wrap around so that any number of speakers gets a color
            color = palette[speaker % len(palette)]
            lines.append(f"[{color}]{text}")
    return "\n".join(lines)
@contextmanager
def suppress_stdout():
    """
    Context manager that silences `sys.stdout` for the duration of the block.

    Used to hide Whisper logs (it is quite verbose). While active,
    `sys.stdout` points to the null device; the previous stream is put
    back on exit, even if the body raises.
    """
    # All credit goes to: https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
    saved_stream = sys.stdout
    with open(os.devnull, "w") as sink:
        sys.stdout = sink
        try:
            yield
        finally:
            sys.stdout = saved_stream
class WhisperTranscriber:
    """
    Speaker-aware transcriber built on Whisper.

    Calling an instance with `(diarization, waveform)` transcribes the
    waveform and returns a list of `(speaker: int, text: str)` pairs, one
    per transcribed segment. A speaker of `-1` means that no speaker was
    found for that segment.
    """

    def __init__(self, model="small", device=None):
        """Load the Whisper model `model` on `device` via `whisper.load_model`."""
        self.model = whisper.load_model(model, device=device)
        # Text transcribed so far, fed back to Whisper as `initial_prompt`.
        # NOTE(review): this is never trimmed, so it keeps growing for as
        # long as the stream runs.
        self._buffer = ""

    def transcribe(self, waveform):
        """Transcribe audio using Whisper"""
        # Flatten the samples to a float32 vector and
        # pad/trim audio to fit 30 seconds as required by Whisper
        audio = waveform.data.astype("float32").reshape(-1)
        audio = whisper.pad_or_trim(audio)
        # Transcribe the given audio while suppressing logs
        with suppress_stdout():
            transcription = whisper.transcribe(
                self.model,
                audio,
                # We use past transcriptions to condition the model
                initial_prompt=self._buffer,
                verbose=True,  # to avoid progress bar
            )
        return transcription

    @staticmethod
    def _segment_bounds(segment):
        """Return `(start, end)` of a transcription segment, relative to the audio."""
        words = segment.get("words")
        if words:
            # Word-level timestamps give the tightest bounds
            return words[0]["start"], words[-1]["end"]
        # A segment without words would otherwise fail on `words[0]`;
        # fall back to the segment-level timestamps
        return segment["start"], segment["end"]

    @staticmethod
    def _pick_speaker(dia):
        """
        Return the id of the speaker that talks the most in `dia`,
        or `-1` if `dia` has no speaker labels.

        Labels are expected to look like `speaker<N>`; the id is `N`.
        """
        speakers = dia.labels()
        if not speakers:
            # No speakers were detected
            return -1
        # With several speakers, keep the one that speaks the most.
        # The id must come from the label itself: the position of the label
        # in `speakers` is not the speaker id.
        dominant = max(speakers, key=dia.label_duration)
        return int(dominant.split("speaker")[1])

    def identify_speakers(self, transcription, diarization, time_shift):
        """
        Iterate over transcription segments to assign speakers.

        `time_shift` is added to the segment timestamps before they are
        looked up in `diarization`. Returns a list of
        `(speaker: int, text: str)` pairs.
        """
        speaker_captions = []
        for segment in transcription["segments"]:
            # Crop diarization to the segment timestamps
            start, end = self._segment_bounds(segment)
            dia = diarization.crop(Segment(time_shift + start, time_shift + end))
            # Assign a speaker to the segment based on diarization
            speaker_captions.append((self._pick_speaker(dia), segment["text"]))
        return speaker_captions

    def __call__(self, diarization, waveform):
        """Transcribe `waveform` and label each segment using `diarization`."""
        # Step 1: Transcribe
        transcription = self.transcribe(waveform)
        # Update transcription buffer
        self._buffer += transcription["text"]
        # The audio may not be the beginning of the conversation
        time_shift = waveform.sliding_window.start
        # Step 2: Assign speakers
        speaker_transcriptions = self.identify_speakers(transcription, diarization, time_shift)
        return speaker_transcriptions
# Suppress whisper-timestamped warnings for a clean output
logging.getLogger("whisper_timestamped").setLevel(logging.ERROR)

# Streaming diarization settings: 5s windows advanced in steps of 500ms
# If you have a GPU, you can also set device=torch.device("cuda")
config = SpeakerDiarizationConfig(
    duration=5,
    step=0.5,
    latency="min",
    tau_active=0.5,
    rho_update=0.1,
    delta_new=0.57
)
dia = SpeakerDiarization(config)
# NOTE(review): the meaning of this argument appears to depend on the
# installed diart version (sample rate vs. block duration in seconds).
# If the script never gets past "Listening...", check the signature of
# `MicrophoneAudioSource` and try `config.step` here instead.
source = MicrophoneAudioSource(config.sample_rate)

# If you have a GPU, you can also set device="cuda"
asr = WhisperTranscriber(model="small")

# Split the stream into 2s chunks for transcription
transcription_duration = 2
# Apply models in batches for better efficiency
batch_size = int(transcription_duration // config.step)

# Chain of operations to apply on the stream of microphone audio
source.stream.pipe(
    # Format audio stream to sliding windows of 5s with a step of 500ms
    dops.rearrange_audio_stream(
        config.duration, config.step, config.sample_rate
    ),
    # Wait until a batch is full
    # The output is a list of audio chunks
    ops.buffer_with_count(count=batch_size),
    # Obtain diarization prediction
    # The output is a list of pairs `(diarization, audio chunk)`
    ops.map(dia),
    # Concatenate 500ms predictions/chunks to form a single 2s chunk
    ops.map(concat),
    # Ignore this chunk if it does not contain speech
    ops.filter(lambda ann_wav: ann_wav[0].get_timeline().duration() > 0),
    # Obtain speaker-aware transcriptions
    # The output is a list of pairs `(speaker: int, caption: str)`
    ops.starmap(asr),
    # Color transcriptions according to the speaker
    # The output is plain text with color references for rich
    ops.map(colorize_transcription),
).subscribe(
    on_next=rich.print,  # print colored text
    # Print the stacktrace of the error that was actually delivered;
    # `traceback.print_exc()` only works while an exception is being handled
    on_error=lambda err: traceback.print_exception(type(err), err, err.__traceback__)
)

print("Listening...")
source.read()
I added code to the script to log in to Hugging Face:
from huggingface_hub import login
login(token=r4a_config['hugging_face_key']) # use key parsed from json config file
You have to provide your contact info:
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Could not download 'pyannote/segmentation' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:
>>> Model.from_pretrained('pyannote/segmentation',
... use_auth_token=YOUR_AUTH_TOKEN)
If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/segmentation to accept the user conditions.
Could not download 'pyannote/segmentation' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:
>>> Model.from_pretrained('pyannote/segmentation',
... use_auth_token=YOUR_AUTH_TOKEN)
If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/segmentation to accept the user conditions.
Traceback (most recent call last):
File "/usr/app/src/./diart_whisper.py", line 210, in <module>
dia = SpeakerDiarization(config)
File "/usr/local/lib/python3.10/dist-packages/diart/blocks/diarization.py", line 96, in __init__
self.segmentation = SpeakerSegmentation(
File "/usr/local/lib/python3.10/dist-packages/diart/blocks/segmentation.py", line 17, in __init__
self.model.to(self.device)
File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 128, in to
self.model = self.model.to(device)
AttributeError: 'NoneType' object has no attribute 'to'
And the same for the embedding model:
Could not download 'pyannote/embedding' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:
>>> Model.from_pretrained('pyannote/embedding',
... use_auth_token=YOUR_AUTH_TOKEN)
If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/embedding to accept the user conditions.
Could not download 'pyannote/embedding' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:
>>> Model.from_pretrained('pyannote/embedding',
... use_auth_token=YOUR_AUTH_TOKEN)
If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/embedding to accept the user conditions.
Now to debug why there is no transcript, only the listening prompt.
I am encountering an error with the hf_token:
Traceback (most recent call last): File "/usr/app/src/./diart_whisper.py", line 206, in <module> dia = SpeakerDiarization(config) File "/usr/local/lib/python3.10/dist-packages/diart/blocks/diarization.py", line 96, in __init__ self.segmentation = SpeakerSegmentation( File "/usr/local/lib/python3.10/dist-packages/diart/blocks/segmentation.py", line 13, in __init__ self.model.eval() File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 136, in eval self.load() File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 124, in load self.model = self.get_model() File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 50, in __call__ model = Model.from_pretrained(self.model_info, use_auth_token=self.hf_token) File "/usr/local/lib/python3.10/dist-packages/pyannote/audio/core/model.py", line 624, in from_pretrained path_for_pl = hf_hub_download( File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn return fn(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 1223, in hf_hub_download headers = build_hf_headers( File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn return fn(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py", line 121, in build_hf_headers token_to_send = get_token_to_send(token) File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py", line 153, in get_token_to_send raise LocalTokenNotFoundError( huggingface_hub.utils._headers.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.
I don't see a way to pass as a parameter in the gist code, since SpeakerDiarizationConfig and SpeakerDiarization don't have a param for the token.
Hi @juanmc2005, I followed all your steps, but whenever I run the script it is just stuck on "Listening...". Here is the output from the console.
(diart) PS C:\Users\User\Desktop\whisper> python main.py C:\Users\User\miniconda3\envs\diart\lib\site-packages\pyannote\audio\core\io.py:43: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call. torchaudio.set_audio_backend("soundfile") C:\Users\User\miniconda3\envs\diart\lib\site-packages\torch_audiomentations\utils\io.py:27: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call. torchaudio.set_audio_backend("soundfile") The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows. The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows. Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run
python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\2ffce0501d0aecad81b43a06d538186e292d0070\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x. Model was trained with torch 1.10.0+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x. Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, runpython -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x. Model was trained with torch 1.8.1+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x. Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, runpython -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x. Model was trained with torch 1.8.1+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x. Listening...
Same here; I'm working on an M2 Pro.
Concerning the issue with the Hugging Face token, detailed instructions can be found in the README to include the token automatically. Otherwise, the token can be passed when loading the models with SegmentationModel.from_pretrained("pyannote/segmentation", hf_token=...) (same for EmbeddingModel).
Concerning the script getting stuck on "listening", I suggest you try to debug line by line to see where it hangs (if it even does so). As I said in other discussions, this could come from different places. You can check out this issue for context and my previous answers.
Hi @juanmc2005
I got these errors when running it.
Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
OMP: Error #15: Initializing libiomp5.dylib, but found libiomp5.dylib already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://www.intel.com/software/products/support/.
I am on a Mac M1 Pro
- Does this code do real-time transcription with speaker labels?
- the gist is stuck at "listening..."
I can fix the listening halt on Ubuntu and Mac OS with M2 Ultra by just changing the following line (line 151):
source = MicrophoneAudioSource(config.sample_rate)
to this:
source = MicrophoneAudioSource(config.step)
Hi @juanmc2005,
First of all, thank you for your article on Medium and for your code.
I knew nothing about Pyannote, Diart or Whisper(/-timestamped) yesterday, and now, I am able to make them work independently in great part thanks to your documentation on Diart.
When it comes to running the current script, I have one last small problem.
I have audio that is interpreted correctly by both Whisper-Timestamped and Diart when I pass it through their respective example codes.
However, when using the current script, the transcription of the audio is completely wrong.
If I pass the audio directly (without splitting it first) to whisper.transcribe, the transcription is good, but the whole text is returned for every timestamp, which is expected now that the audio is not split.
I don't know if this is a problem that comes from some settings I haven't made properly in your script, or if this is a problem related to Whisper.
If you have any clues on the subject, I would be interested to hear about it.
If the problem comes from Whisper, I will ask in their repository.
Thank you in advance and take care.
I am encountering an error with the hf_token:
I don't see a way to pass as a parameter in the gist code, since SpeakerDiarizationConfig and SpeakerDiarization don't have a param for the token.