Whisper-based note-taking software: continuously records from the microphone, waits for a pause in speech, and transcribes the captured audio with OpenAI Whisper.
import sounddevice as sd
import numpy as np
import pynput.keyboard
import time
import whisper
import tempfile
import os
from scipy.io.wavfile import write
# Load Whisper Model
model = whisper.load_model("base")
keyboard = pynput.keyboard.Controller()
# Allowed Languages
Allowed_Languages = ['en']
# Silence Detection Parameters
SILENCE_THRESHOLD = 75 # Adjust based on your microphone sensitivity
SILENCE_DURATION = 1 # Seconds of silence required to stop recording and transcribe
MINIMUM_DURATION = 3.0 # Minimum length of a recording (seconds) before it is transcribed
def is_silent(data, threshold=SILENCE_THRESHOLD):
    # Check if the average amplitude is below the threshold
    v = np.mean(np.abs(data))
    return v < threshold
def transcribe_and_type(audio_buffer, samplerate=16000):
    # Save the audio buffer to a temporary WAV file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name
        # Write the audio buffer to the temporary file using scipy
        write(temp_audio_file_path, samplerate, audio_buffer)

    # Load and transcribe the audio using Whisper
    audio = whisper.load_audio(temp_audio_file_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # Ensure float32

    # Skip transcription if the detected language is not in the allowed list
    _, probs = model.detect_language(mel)
    if max(probs, key=probs.get) not in Allowed_Languages:
        os.remove(temp_audio_file_path)
        return None

    options = whisper.DecodingOptions(fp16=False)  # Disable half-precision
    result = whisper.decode(model, mel, options)

    # Print the recognized text
    print(result.text)

    # Simulate typing the text
    # for char in result.text:
    #     keyboard.type(char)

    # Clean up the temporary file
    os.remove(temp_audio_file_path)
def continuous_recording(samplerate=16000):
    print("Listening...")
    audio_buffer = []
    recording = False
    silence_start_time = None
    recording_start_time = None

    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16') as stream:
        while True:
            # Read small chunks of audio continuously
            data, _ = stream.read(1024)
            audio_buffer.extend(data.flatten())

            # Check for silence
            if is_silent(data):
                if recording:
                    if silence_start_time is None:
                        silence_start_time = time.time()
                    elif time.time() - silence_start_time >= SILENCE_DURATION:
                        print("duration", time.time() - recording_start_time)
                        if time.time() - recording_start_time > MINIMUM_DURATION:
                            # Stop recording and transcribe
                            transcribe_and_type(np.array(audio_buffer), samplerate)
                        # Reset state whether or not the clip was long enough to transcribe
                        audio_buffer.clear()
                        recording = False
                        silence_start_time = None
                        recording_start_time = None
            else:
                silence_start_time = None  # Reset if voice is detected
                if not recording:
                    # Start recording when sound is detected
                    recording = True
                    recording_start_time = time.time()
if __name__ == "__main__":
    continuous_recording()
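As published, the script only prints each transcript to the console (the `keyboard.type` loop is commented out), so nothing is actually persisted as a note. Below is a minimal sketch of appending each transcript to a Markdown notes file; the `NOTES_PATH` location, the timestamp format, and the idea of calling `append_note(result.text)` from `transcribe_and_type` right after the `print(result.text)` line are assumptions, not part of the original gist.

from datetime import datetime
from pathlib import Path

NOTES_PATH = Path("whisper_notes.md")  # assumed location; change to taste

def append_note(text: str, notes_path: Path = NOTES_PATH) -> None:
    """Append one transcribed utterance to a Markdown notes file with a timestamp."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with notes_path.open("a", encoding="utf-8") as f:
        f.write(f"- **{stamp}** {text.strip()}\n")

Calling append_note where the script currently prints the transcript would turn the console demo into persistent note taking without touching the recording loop.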