Skip to content

Instantly share code, notes, and snippets.

@victormurcia
Last active October 22, 2024 17:21
Show Gist options
  • Save victormurcia/96e152244cbdd6c6fc0a7d27532d6f02 to your computer and use it in GitHub Desktop.
Save victormurcia/96e152244cbdd6c6fc0a7d27532d6f02 to your computer and use it in GitHub Desktop.
def diarize_audio(audio_path, method='nmf', min_segment_duration=1.0, max_speakers=8, min_speakers=1):
    """
    Diarization workflow with automatic speaker count estimation.

    Runs the full pipeline: preprocessing, feature extraction, speaker-count
    estimation, source separation and per-frame speaker assignment, then
    collapses the per-frame labels into contiguous speaker segments.

    Parameters:
    -----------
    audio_path : str
        Path to the audio file
    method : str
        Diarization method ('nmf' or 'ica')
    min_segment_duration : float
        Minimum duration for a speaker segment in seconds
    max_speakers : int
        Maximum number of speakers to consider
    min_speakers : int
        Minimum number of speakers to consider

    Returns:
    --------
    pandas.DataFrame
        One row per speaker segment with columns 'start', 'end', 'speaker',
        'duration' and 'confidence'. A recording whose label never changes
        yields a single row; a recording with no frames yields an empty
        DataFrame that still carries these columns.
    """
    print("Preprocessing audio...")
    y, sr = preprocess_audio(audio_path)

    print("Extracting features...")
    # The third value (hop length) is returned by the helper but not needed here.
    features, timestamps, _hop_length = extract_features(y, sr)

    print("Estimating number of speakers...")
    n_speakers = estimate_n_speakers(features, method, max_speakers, min_speakers)
    print(f"Estimated number of speakers: {n_speakers}")

    print(f"Applying source separation using {method.upper()}...")
    # H is returned by the separation step but only W is used for assignment.
    W, _H = apply_source_separation(features, n_speakers, method)

    print("Assigning speakers...")
    labels, confidence_scores = assign_speakers(W, timestamps, min_segment_duration)

    return _build_segments(labels, confidence_scores, timestamps)


def _build_segments(labels, confidence_scores, timestamps):
    """Collapse per-frame speaker labels into a DataFrame of contiguous segments."""
    columns = ['start', 'end', 'speaker', 'duration', 'confidence']

    labels = np.asarray(labels)
    confidence_scores = np.asarray(confidence_scores)
    n_frames = len(labels)
    if n_frames == 0:
        # Nothing to segment; keep the schema so callers can still select columns.
        return pd.DataFrame(columns=columns)

    # Frame spacing is taken from the first two timestamps (assumes uniform
    # spacing -- TODO confirm against extract_features). With a single
    # timestamp there is no spacing to measure, so fall back to 0.
    frame_step = timestamps[1] - timestamps[0] if len(timestamps) > 1 else 0.0
    frame_times = np.arange(n_frames) * frame_step

    # Indices at which the label differs from the previous frame.
    boundaries = np.flatnonzero(labels[1:] != labels[:-1]) + 1
    starts = np.concatenate(([0], boundaries))
    stops = np.concatenate((boundaries, [n_frames]))  # exclusive end indices

    last_frame = n_frames - 1
    segments = []
    for first, stop in zip(starts, stops):
        start_time = frame_times[first]
        # A segment ends where the next one starts; the final segment ends at
        # the time of the last frame (there is no later frame to point at).
        end_time = frame_times[min(stop, last_frame)]
        segments.append({
            'start': start_time,
            'end': end_time,
            'speaker': f"SPEAKER_{labels[first]}",
            'duration': end_time - start_time,
            # Slice by frame index instead of comparing float times.
            'confidence': np.mean(confidence_scores[first:stop]),
        })

    return pd.DataFrame(segments, columns=columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment