Created
October 18, 2024 20:23
-
-
Save sam2332/107e4f3052157d852603a1247885a120 to your computer and use it in GitHub Desktop.
Whisper-based note-taking software
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sounddevice as sd | |
import numpy as np | |
import pynput.keyboard | |
import time | |
import whisper | |
import tempfile | |
import os | |
from scipy.io.wavfile import write | |
# Whisper speech-recognition model, loaded once at import time ("base" checkpoint).
model = whisper.load_model("base")

# Keyboard controller; only referenced by the (currently disabled) typing step.
keyboard = pynput.keyboard.Controller()

# Language codes whose transcriptions are accepted; audio detected as any
# other language is discarded.
Allowed_Languages = ["en"]
# Silence detection parameters
SILENCE_THRESHOLD = 75  # Mean absolute amplitude below which a chunk counts as silent; adjust to microphone sensitivity
SILENCE_DURATION = 1  # Seconds of silence required to stop recording and transcribe
MINIMUM_DURATION = 3.0  # Minimum duration (seconds) of audio to transcribe


def is_silent(data, threshold=SILENCE_THRESHOLD):
    """Return True when the mean absolute amplitude of *data* is below *threshold*.

    Args:
        data: Array-like of audio samples (any shape). The recording loop
            passes int16 chunks.
        threshold: Amplitude level, in the same units as the samples, below
            which the chunk is considered silent.

    Returns:
        bool: True if the chunk is silent. An empty chunk is treated as silent.
    """
    # Widen before taking magnitudes: abs() on int16 cannot represent
    # 32768, so a sample of -32768 would stay negative and make a fully
    # clipped (very loud) chunk look silent.
    samples = np.asarray(data).astype(np.float64)
    if samples.size == 0:
        # The mean of nothing is NaN (plus a RuntimeWarning); no samples means no sound.
        return True
    return bool(np.mean(np.abs(samples)) < threshold)
def transcribe_and_type(audio_buffer, samplerate=16000):
    """Transcribe an audio buffer with Whisper and print the recognized text.

    The buffer is written to a temporary WAV file, loaded back through
    Whisper, checked against ``Allowed_Languages`` and then decoded.

    Args:
        audio_buffer: NumPy array of audio samples (the recording loop
            passes 1-D int16 data).
        samplerate: Sample rate of ``audio_buffer`` in Hz.

    Returns:
        The recognized text, or None when the most probable detected
        language is not in ``Allowed_Languages``.
    """
    # Only reserve a unique path here; the handle is closed before scipy
    # reopens the file by name (an open handle blocks that on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name

    try:
        # Write the audio buffer to the temporary file using scipy
        write(temp_audio_file_path, samplerate, audio_buffer)

        # Load the audio and fit it to the model's fixed input window
        audio = whisper.load_audio(temp_audio_file_path)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # Ensure float32

        # Reject audio whose most probable language is not allowed
        _, probs = model.detect_language(mel)
        if max(probs, key=probs.get) not in Allowed_Languages:
            return None

        options = whisper.DecodingOptions(fp16=False)  # Disable half-precision
        result = whisper.decode(model, mel, options)

        # Print the recognized text
        print(result.text)

        # NOTE: simulated typing is currently disabled; presumably
        # keyboard.type(result.text) would re-enable it -- confirm before use.
        return result.text
    finally:
        # Clean up the temporary file on every path, including errors
        os.remove(temp_audio_file_path)
def continuous_recording(samplerate=16000, chunk_size=1024, pre_roll_seconds=0.5):
    """Listen on the default input device and transcribe each spoken segment.

    A segment starts with the first non-silent chunk and ends once
    ``SILENCE_DURATION`` seconds of continuous silence have been observed.
    Segments longer than ``MINIMUM_DURATION`` seconds are passed to
    ``transcribe_and_type``; shorter ones are discarded. Runs until
    interrupted.

    Args:
        samplerate: Capture sample rate in Hz.
        chunk_size: Number of frames read from the stream per iteration.
        pre_roll_seconds: Amount of audio preceding the first non-silent
            chunk that is kept, so the onset of speech is not clipped.
    """
    print("Listening...")

    # While idle only this many recent chunks are retained. Keeping all idle
    # audio would grow without bound and, because the transcriber only looks
    # at a fixed-length window from the start of the buffer, a long idle
    # period would push the actual speech out of that window.
    pre_roll_chunks = max(1, int(pre_roll_seconds * samplerate / chunk_size))

    chunks = []
    recording = False
    silence_start_time = None
    recording_start_time = None

    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16') as stream:
        while True:
            # Read small chunks of audio continuously
            data, _ = stream.read(chunk_size)
            chunks.append(data.flatten())

            if not is_silent(data):
                # Voice detected: cancel any pending silence timer and
                # start a segment if one is not already in progress.
                silence_start_time = None
                if not recording:
                    recording = True
                    recording_start_time = time.monotonic()
                continue

            if not recording:
                # Idle silence: keep only the short pre-roll.
                del chunks[:-pre_roll_chunks]
                continue

            now = time.monotonic()
            if silence_start_time is None:
                silence_start_time = now
                continue
            if now - silence_start_time < SILENCE_DURATION:
                continue

            # Enough trailing silence: the segment is finished.
            duration = now - recording_start_time
            print("duration", duration)
            if duration > MINIMUM_DURATION:
                # Stop recording and transcribe
                transcribe_and_type(np.concatenate(chunks), samplerate)

            # Reset for the next segment (too-short segments are dropped).
            chunks.clear()
            recording = False
            silence_start_time = None
            recording_start_time = None
if __name__ == "__main__":
    # The recording loop never returns on its own; Ctrl+C is the intended
    # way to stop, so exit quietly instead of dumping a traceback.
    try:
        continuous_recording()
    except KeyboardInterrupt:
        print("Stopped.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment