import numpy as np
import pandas as pd

# Assumes preprocess_audio, extract_features, estimate_n_speakers,
# apply_source_separation, and assign_speakers are defined elsewhere.


def diarize_audio(audio_path, method='nmf', min_segment_duration=1.0, max_speakers=8, min_speakers=1):
    """
    Diarization workflow with automatic speaker count estimation.

    Parameters
    ----------
    audio_path : str
        Path to the audio file.
    method : str
        Diarization method ('nmf' or 'ica').
    min_segment_duration : float
        Minimum duration of a speaker segment, in seconds.
    max_speakers : int
        Maximum number of speakers to consider.
    min_speakers : int
        Minimum number of speakers to consider.

    Returns
    -------
    pandas.DataFrame
        DataFrame of speaker segments with timing, label, and confidence.
    """
    print("Preprocessing audio...")
    y, sr = preprocess_audio(audio_path)

    print("Extracting features...")
    features, timestamps, hop_length = extract_features(y, sr)

    print("Estimating number of speakers...")
    n_speakers = estimate_n_speakers(features, method, max_speakers, min_speakers)
    print(f"Estimated number of speakers: {n_speakers}")

    print(f"Applying source separation using {method.upper()}...")
    W, H = apply_source_separation(features, n_speakers, method)

    print("Assigning speakers...")
    labels, confidence_scores = assign_speakers(W, timestamps, min_segment_duration)

    # Frame times on a uniform grid (assumes evenly spaced timestamps)
    frame_times = np.arange(len(labels)) * (timestamps[1] - timestamps[0])

    # Build speaker segments by merging consecutive frames with the same label
    segments = []
    current_speaker = labels[0]
    segment_start = frame_times[0]

    for i in range(1, len(labels)):
        if labels[i] != current_speaker:
            # Average confidence over the frames belonging to this segment
            segment_mask = (frame_times >= segment_start) & (frame_times < frame_times[i])
            segments.append({
                'start': segment_start,
                'end': frame_times[i],
                'speaker': f"SPEAKER_{current_speaker}",
                'duration': frame_times[i] - segment_start,
                'confidence': np.mean(confidence_scores[segment_mask])
            })
            segment_start = frame_times[i]
            current_speaker = labels[i]

    # Always add the final segment; this also covers single-speaker audio,
    # where no change point is found and the list would otherwise stay empty.
    segment_mask = frame_times >= segment_start
    segments.append({
        'start': segment_start,
        'end': frame_times[-1],
        'speaker': f"SPEAKER_{current_speaker}",
        'duration': frame_times[-1] - segment_start,
        'confidence': np.mean(confidence_scores[segment_mask])
    })

    return pd.DataFrame(segments)
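
# The helpers called above are defined elsewhere in the gist. As a rough
# illustration of the separation step only, here is a minimal sketch of what
# apply_source_separation could look like using scikit-learn's NMF and FastICA.
# The name and signature match the call above, but the internals are an
# assumption, not the gist's actual implementation. NMF requires a
# non-negative feature matrix (e.g., a mel spectrogram), transposed so that
# rows correspond to time frames.
#
# from sklearn.decomposition import NMF, FastICA
#
# def apply_source_separation(features, n_speakers, method='nmf'):
#     X = features.T  # (n_frames, n_features): one row per time frame
#     if method == 'nmf':
#         model = NMF(n_components=n_speakers, init='nndsvda', max_iter=500, random_state=0)
#         W = model.fit_transform(X)   # (n_frames, n_speakers) activations
#         H = model.components_        # (n_speakers, n_features) bases
#     elif method == 'ica':
#         model = FastICA(n_components=n_speakers, random_state=0)
#         W = model.fit_transform(X)   # (n_frames, n_speakers) source estimates
#         H = model.components_        # unmixing components
#     else:
#         raise ValueError(f"Unknown method: {method!r}")
#     return W, H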
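
# Hypothetical usage sketch; "interview.wav" is a placeholder path, not a file
# referenced by the gist.
if __name__ == "__main__":
    segments_df = diarize_audio("interview.wav", method='nmf', min_segment_duration=1.0, max_speakers=8)
    print(segments_df.head())
    segments_df.to_csv("diarization_segments.csv", index=False)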