import sounddevice as sd
import numpy as np
import pynput.keyboard
import time
import whisper
import tempfile
import os
from scipy.io.wavfile import write

# Load the Whisper "base" model once at import time; transcribe_and_type
# reuses this single instance for language detection and decoding.
model = whisper.load_model("base")
# Keyboard controller intended for typing the recognized text.
# NOTE(review): no active code in this file sends keystrokes with it.
keyboard = pynput.keyboard.Controller()

# Language codes accepted for transcription; audio whose most probable
# detected language is not in this list is discarded by transcribe_and_type.
Allowed_Languages = ['en']
# Silence Detection Parameters
SILENCE_THRESHOLD = 75  # Mean absolute amplitude below which a chunk is silent; adjust to your microphone sensitivity
SILENCE_DURATION = 1   # Seconds of silence required to stop recording and transcribe
MINIMUM_DURATION = 3.0   # Minimum duration of audio to transcribe

def is_silent(data, threshold=SILENCE_THRESHOLD):
    """Return True when the mean absolute amplitude of ``data`` is below ``threshold``.

    Args:
        data: Array-like of audio samples (any shape, any numeric dtype).
        threshold: Amplitude limit, in the same units as the samples.

    Returns:
        bool: True if the chunk counts as silence. An empty chunk is silent.
    """
    # Widen before taking the absolute value: in int16, abs(-32768) wraps
    # back to -32768, which would drag the mean negative and make a loud,
    # clipped chunk look silent.
    samples = np.asarray(data, dtype=np.float64)
    if samples.size == 0:
        # No samples means no sound; also avoids NaN from the mean of nothing.
        return True
    return bool(np.abs(samples).mean() < threshold)

def transcribe_and_type(audio_buffer, samplerate=16000):
    """Transcribe a block of recorded audio with Whisper and print the text.

    The samples are written to a temporary WAV file and loaded back through
    Whisper. If the most probable detected language is not listed in
    ``Allowed_Languages`` the audio is dropped without being transcribed.

    Args:
        audio_buffer: NumPy array of audio samples, in a dtype accepted by
            ``scipy.io.wavfile.write``.
        samplerate: Sample rate of ``audio_buffer`` in Hz.

    Returns:
        None. The recognized text is only printed.
    """
    # Reserve a unique path, then close the handle before anything else
    # opens the file by name (re-opening an open temp file is not portable).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name

    try:
        write(temp_audio_file_path, samplerate, audio_buffer)
        audio = whisper.load_audio(temp_audio_file_path)
    finally:
        # The decoded samples are in memory now (or loading failed); either
        # way the file must not be left behind.
        os.remove(temp_audio_file_path)

    # NOTE(review): pad_or_trim fits the audio to Whisper's fixed input
    # window (30 s by default), so the tail of longer recordings is dropped.
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # Ensure float32

    # Skip audio whose most probable language is not allowed.
    _, probs = model.detect_language(mel)
    if max(probs, key=probs.get) not in Allowed_Languages:
        return None

    options = whisper.DecodingOptions(fp16=False)  # Disable half-precision
    result = whisper.decode(model, mel, options)

    # Print the recognized text
    print(result.text)

    # NOTE: simulated typing is disabled; to enable it, send result.text
    # through the module-level pynput controller (keyboard.type(result.text)).
    return None

def continuous_recording(samplerate=16000):
    """Listen on the default input device and transcribe each utterance.

    Audio is read in 1024-frame chunks. A chunk that is not silent (per
    ``is_silent``) starts an utterance. The utterance ends, and is passed to
    ``transcribe_and_type``, once silence has lasted at least
    ``SILENCE_DURATION`` seconds and more than ``MINIMUM_DURATION`` seconds
    have passed since the utterance started. The loop never returns.

    Args:
        samplerate: Capture sample rate in Hz, also forwarded to
            ``transcribe_and_type``.
    """
    from collections import deque  # local import: only this function needs it

    chunk_frames = 1024
    # While idle, keep only the last ~0.5 s of audio so the start of the next
    # utterance is not clipped. Buffering everything while idle would grow
    # without bound and hand Whisper long stretches of leading silence.
    pre_roll = deque(maxlen=max(1, int(0.5 * samplerate / chunk_frames)))
    chunks = []  # 1-D int16 arrays belonging to the current utterance
    recording = False
    silence_start_time = None
    recording_start_time = None

    print("Listening...")
    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16') as stream:
        while True:
            # Read small chunks of audio continuously
            data, _ = stream.read(chunk_frames)
            samples = data.flatten()  # flatten() copies, so the chunk is ours to keep

            if not is_silent(data):
                silence_start_time = None  # Reset if voice is detected
                if not recording:
                    # Start recording when sound is detected, seeding the
                    # utterance with the audio captured just before it.
                    recording = True
                    recording_start_time = time.time()
                    chunks.extend(pre_roll)
                    pre_roll.clear()
                chunks.append(samples)
                continue

            if not recording:
                # Idle silence: remember it only as bounded pre-roll.
                pre_roll.append(samples)
                continue

            # Silence inside an utterance stays part of the recording.
            chunks.append(samples)
            now = time.time()
            if silence_start_time is None:
                silence_start_time = now
                continue
            if now - silence_start_time < SILENCE_DURATION:
                continue

            duration = now - recording_start_time
            if duration <= MINIMUM_DURATION:
                # Too short so far; keep listening until the minimum is met.
                continue

            # Stop recording and transcribe
            print("duration", duration)
            transcribe_and_type(np.concatenate(chunks), samplerate)
            chunks.clear()
            recording = False
            silence_start_time = None
            recording_start_time = None

# Start the listen-and-transcribe loop only when run as a script, not on import.
if __name__ == "__main__":
    continuous_recording()