Whisper-based note-taking software: continuously records from the microphone, waits for a pause in speech, and transcribes the captured audio with OpenAI Whisper.
import sounddevice as sd
import numpy as np
import pynput.keyboard
import time
import whisper
import tempfile
import os
from scipy.io.wavfile import write
# Load Whisper Model
model = whisper.load_model("base")
keyboard = pynput.keyboard.Controller()
# Allowed Languages
Allowed_Languages = ['en']
# Silence Detection Parameters
SILENCE_THRESHOLD = 75 # Adjust based on your microphone sensitivity
SILENCE_DURATION = 1 # Seconds of silence required to stop recording and transcribe
MINIMUM_DURATION = 3.0 # Minimum length of a recording (seconds) before it is transcribed
def is_silent(data, threshold=SILENCE_THRESHOLD):
    # Check if the average amplitude is below the threshold
    v = np.mean(np.abs(data))
    return v < threshold
def transcribe_and_type(audio_buffer, samplerate=16000):
    # Save the audio buffer to a temporary WAV file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name
        # Write the audio buffer to the temporary file using scipy
        write(temp_audio_file_path, samplerate, audio_buffer)

    # Load and transcribe the audio using Whisper
    audio = whisper.load_audio(temp_audio_file_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # Ensure float32

    # Skip transcription if the detected language is not in the allowed list
    _, probs = model.detect_language(mel)
    if max(probs, key=probs.get) not in Allowed_Languages:
        os.remove(temp_audio_file_path)
        return None

    options = whisper.DecodingOptions(fp16=False)  # Disable half-precision
    result = whisper.decode(model, mel, options)

    # Print the recognized text
    print(result.text)

    # Simulate typing the text
    # for char in result.text:
    #     keyboard.type(char)

    # Clean up the temporary file
    os.remove(temp_audio_file_path)
def continuous_recording(samplerate=16000):
    print("Listening...")
    audio_buffer = []
    recording = False
    silence_start_time = None
    recording_start_time = None

    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16') as stream:
        while True:
            # Read small chunks of audio continuously
            data, _ = stream.read(1024)
            audio_buffer.extend(data.flatten())

            # Check for silence
            if is_silent(data):
                if recording:
                    if silence_start_time is None:
                        silence_start_time = time.time()
                    elif time.time() - silence_start_time >= SILENCE_DURATION:
                        print("duration", time.time() - recording_start_time)
                        if time.time() - recording_start_time > MINIMUM_DURATION:
                            # Stop recording and transcribe
                            transcribe_and_type(np.array(audio_buffer), samplerate)
                        # Reset state whether or not the clip was long enough to transcribe
                        audio_buffer.clear()
                        recording = False
                        silence_start_time = None
                        recording_start_time = None
            else:
                silence_start_time = None  # Reset if voice is detected
                if not recording:
                    # Start recording when sound is detected
                    recording = True
                    recording_start_time = time.time()
if __name__ == "__main__":
    continuous_recording()
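As published, the script only prints each transcript to the console (the `keyboard.type` loop is commented out), so nothing is actually persisted as a note. Below is a minimal sketch of appending each transcript to a Markdown notes file; the `NOTES_PATH` location, the timestamp format, and the idea of calling `append_note(result.text)` from `transcribe_and_type` right after the `print(result.text)` line are assumptions, not part of the original gist.

from datetime import datetime
from pathlib import Path

NOTES_PATH = Path("whisper_notes.md")  # assumed location; change to taste

def append_note(text: str, notes_path: Path = NOTES_PATH) -> None:
    """Append one transcribed utterance to a Markdown notes file with a timestamp."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with notes_path.open("a", encoding="utf-8") as f:
        f.write(f"- **{stamp}** {text.strip()}\n")

Calling append_note where the script currently prints the transcript would turn the console demo into persistent note taking without touching the recording loop.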