"""Listen on the default microphone and transcribe spoken phrases with Whisper.

Audio is read in small chunks. An utterance starts when a chunk is louder
than SILENCE_THRESHOLD and ends after SILENCE_DURATION seconds of continuous
silence; utterances longer than MINIMUM_DURATION are transcribed and printed.
"""

import os
import tempfile
import time

import numpy as np
import pynput.keyboard
import sounddevice as sd
import whisper
from scipy.io.wavfile import write

# Load Whisper Model
model = whisper.load_model("base")
keyboard = pynput.keyboard.Controller()

# Allowed Languages
Allowed_Languages = ['en']

# Silence Detection Parameters
SILENCE_THRESHOLD = 75  # Adjust based on your microphone sensitivity
SILENCE_DURATION = 1  # Seconds of silence required to stop recording and transcribe
MINIMUM_DURATION = 3.0  # Minimum duration of audio to transcribe


def is_silent(data, threshold=SILENCE_THRESHOLD):
    """Return True when the mean absolute amplitude of *data* is below *threshold*.

    Args:
        data: Array-like of audio samples (the recorder passes int16 chunks).
        threshold: Mean absolute amplitude below which the chunk is silent.

    Returns:
        bool: True if the chunk is silent. An empty chunk counts as silent.
    """
    samples = np.asarray(data)
    if samples.size == 0:
        # np.mean of an empty array is NaN (plus a RuntimeWarning).
        return True
    # Widen before abs(): abs(int16(-32768)) wraps around to -32768.
    level = np.mean(np.abs(samples.astype(np.float64)))
    return bool(level < threshold)


def transcribe_and_type(audio_buffer, samplerate=16000):
    """Transcribe *audio_buffer* with Whisper and print the recognized text.

    The samples are written to a temporary WAV file, loaded back through
    Whisper, and decoded only if the most probable detected language is in
    Allowed_Languages. The temporary file is always removed.

    Args:
        audio_buffer: 1-D NumPy array of audio samples.
        samplerate: Sample rate of *audio_buffer* in Hz.

    Returns:
        None. The text is printed; nothing is printed when the detected
        language is not allowed.
    """
    # Reserve a path and close the handle before writing to it: reopening a
    # still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name

    try:
        # Write the audio buffer to the temporary file using scipy
        write(temp_audio_file_path, samplerate, audio_buffer)

        # Load and transcribe the audio using Whisper
        audio = whisper.load_audio(temp_audio_file_path)
        # NOTE(review): pad_or_trim keeps only the start of the clip (Whisper's
        # fixed input window), so very long utterances are truncated.
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # Ensure float32

        _, probs = model.detect_language(mel)
        if max(probs, key=probs.get) not in Allowed_Languages:
            return None

        options = whisper.DecodingOptions(fp16=False)  # Disable half-precision
        result = whisper.decode(model, mel, options)

        # Print the recognized text
        print(result.text)

        # Simulate typing the text (currently disabled)
        # for char in result.text:
        #     keyboard.type(char)
    finally:
        # Clean up the temporary file even if writing or decoding failed.
        os.remove(temp_audio_file_path)


def continuous_recording(samplerate=16000):
    """Capture microphone audio forever and transcribe each detected utterance.

    Runs until interrupted. Only audio belonging to the current utterance is
    buffered; silence between utterances is discarded so the buffer cannot
    grow without bound and old silence cannot push speech out of Whisper's
    input window.

    Args:
        samplerate: Capture sample rate in Hz, also passed to the transcriber.
    """
    print("Listening...")
    audio_chunks = []  # int16 chunks of the utterance in progress
    recording = False
    silence_start_time = None
    recording_start_time = None

    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16') as stream:
        while True:
            # Read small chunks of audio continuously
            data, _ = stream.read(1024)
            # Monotonic clock: interval maths must not see wall-clock jumps.
            now = time.monotonic()

            if not is_silent(data):
                silence_start_time = None  # Reset if voice is detected
                if not recording:
                    # Start recording when sound is detected
                    recording = True
                    recording_start_time = now
                audio_chunks.append(data.flatten())
                continue

            if not recording:
                # Idle silence: nothing to keep.
                continue

            # Silence inside an utterance is kept (pauses between words).
            audio_chunks.append(data.flatten())
            if silence_start_time is None:
                silence_start_time = now
                continue
            if now - silence_start_time < SILENCE_DURATION:
                continue

            # The utterance is over; the duration includes the trailing silence.
            duration = now - recording_start_time
            print("duration", duration)
            if duration > MINIMUM_DURATION:
                # Stop recording and transcribe
                transcribe_and_type(np.concatenate(audio_chunks), samplerate)

            # Reset state whether or not the utterance was long enough.
            audio_chunks.clear()
            recording = False
            silence_start_time = None
            recording_start_time = None


if __name__ == "__main__":
    continuous_recording()