Last active
January 17, 2026 21:01
-
-
Save ErikDeBruijn/395c92a03218e709aebe5d55fc33a126 to your computer and use it in GitHub Desktop.
Jarvis Voice Assistant Daemon - Always-on voice daemon for Claude Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Jarvis Voice Assistant Daemon Configuration
wake_words: ["hey jarvis", "jarvis"]
language: "nl"
llm:
  command: "/Users/erik/.claude/local/claude"  # Full path to the correct version
  jarvis_prompt: "/jarvis"  # activates voice mode
  extra_args: []  # e.g. ["--model", "opus"]
  tmux_session: "jarvis"  # tmux session name
whisper:
  url: "http://10.1.1.64:8081/inference"
  local_fallback: true
  model: "base"  # for the local fallback
audio:
  device_name: "Yeti X"  # Explicitly pick a microphone (or null for default)
  native_sample_rate: 48000  # Native sample rate of the microphone
  target_sample_rate: 16000  # Sample rate for VAD/Whisper
  channels: 1
  silence_seconds: 1.5  # seconds of silence = end of utterance
  max_listen_seconds: 30  # maximum recording time
  vad_threshold: 0.5  # Silero VAD threshold
session:
  idle_timeout_seconds: 60  # No input -> close session
  end_phrases: ["tot ziens", "bedankt jarvis", "klaar", "goodbye", "exit"]
tts:
  command: "say"
  voice: "Xander"
  rate: 180
sounds:
  wake: "~/.config/jarvis/sounds/wake.wav"
  listening: "~/.config/jarvis/sounds/listening.wav"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Jarvis Voice Assistant Daemon | |
| An always-on voice daemon that orchestrates Claude Code sessions. | |
| Wake word triggers a new conversation, daemon manages the entire session | |
| including follow-up responses. | |
| Usage: | |
| jarvis-daemon # Run in foreground | |
| jarvis-daemon --test-wake # Test wake word detection | |
| jarvis-daemon --test-listen # Test audio capture + VAD | |
| jarvis-daemon --active # Active mode (voice enabled) | |
| jarvis-daemon --passive # Passive mode (monitoring only) | |
| """ | |
| import argparse | |
| import io | |
| import logging | |
| import os | |
| import signal | |
| import subprocess | |
| import sys | |
| import tempfile | |
| import threading | |
| import time | |
| import wave | |
| from dataclasses import dataclass | |
| from enum import Enum, auto | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| import requests | |
| import sounddevice as sd | |
| import yaml | |
# Optional imports with graceful fallback: the daemon still runs with
# reduced functionality when these heavy dependencies are missing.
try:
    import openwakeword
    from openwakeword.model import Model as WakeWordModel
    WAKE_WORD_AVAILABLE = True
except ImportError:
    WAKE_WORD_AVAILABLE = False
    logging.warning("openwakeword not installed, wake word detection disabled")
try:
    import torch  # required for the Silero VAD model (loaded via torch.hub)
    SILERO_AVAILABLE = True
except ImportError:
    SILERO_AVAILABLE = False
    logging.warning("torch not installed, using simple VAD")
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class DaemonState(Enum):
    """States of the daemon's main-loop state machine (see JarvisDaemon.run)."""
    IDLE = auto()        # Waiting for wake word
    LISTENING = auto()   # Recording user speech
    PROCESSING = auto()  # Sending to Whisper/Claude
    SPEAKING = auto()    # TTS playing
    FOLLOW_UP = auto()   # Waiting for follow-up (no wake word needed)
@dataclass
class Config:
    """Configuration loaded from YAML.

    Every field maps to a key in the config file; Config.load supplies a
    default for any missing key or section.
    """
    wake_words: list[str]
    language: str
    llm_command: str
    llm_jarvis_prompt: str
    llm_extra_args: list[str]
    tmux_session: str
    whisper_url: str
    whisper_local_fallback: bool
    device_name: Optional[str]       # substring match against input devices; None = default
    native_sample_rate: int          # microphone's native rate (Hz)
    target_sample_rate: int          # rate fed to VAD/Whisper (Hz)
    channels: int
    silence_seconds: float           # trailing silence that ends an utterance
    max_listen_seconds: float
    vad_threshold: float
    idle_timeout_seconds: int
    end_phrases: list[str]
    tts_command: str
    tts_voice: str
    tts_rate: int
    sound_wake: str                  # path, ~ expanded at load time
    sound_listening: str             # path, ~ expanded at load time

    @classmethod
    def load(cls, path: Path) -> 'Config':
        """Load configuration from a YAML file.

        Tolerates an empty file (yaml.safe_load returns None) and missing
        sections/keys by falling back to built-in defaults.

        Args:
            path: Path to the YAML config file.

        Raises:
            OSError: if the file cannot be opened.
        """
        with open(path) as f:
            # safe_load returns None for an empty file; treat that as {}
            # so the .get() chain below does not raise AttributeError.
            data = yaml.safe_load(f) or {}
        # Hoist section dicts once instead of re-fetching them per key.
        llm = data.get('llm', {})
        whisper = data.get('whisper', {})
        audio = data.get('audio', {})
        session = data.get('session', {})
        tts = data.get('tts', {})
        sounds = data.get('sounds', {})
        return cls(
            wake_words=data.get('wake_words', ['hey jarvis', 'jarvis']),
            language=data.get('language', 'nl'),
            llm_command=llm.get('command', 'claude'),
            llm_jarvis_prompt=llm.get('jarvis_prompt', '/jarvis'),
            llm_extra_args=llm.get('extra_args', []),
            tmux_session=llm.get('tmux_session', 'jarvis'),
            whisper_url=whisper.get('url', 'http://10.1.1.64:8081/inference'),
            whisper_local_fallback=whisper.get('local_fallback', True),
            device_name=audio.get('device_name'),
            native_sample_rate=audio.get('native_sample_rate', 48000),
            target_sample_rate=audio.get('target_sample_rate', 16000),
            channels=audio.get('channels', 1),
            silence_seconds=audio.get('silence_seconds', 1.5),
            max_listen_seconds=audio.get('max_listen_seconds', 30),
            vad_threshold=audio.get('vad_threshold', 0.5),
            idle_timeout_seconds=session.get('idle_timeout_seconds', 60),
            end_phrases=session.get('end_phrases', ['tot ziens', 'bedankt jarvis', 'klaar']),
            tts_command=tts.get('command', 'say'),
            tts_voice=tts.get('voice', 'Xander'),
            tts_rate=tts.get('rate', 180),
            sound_wake=os.path.expanduser(sounds.get('wake', '~/.config/jarvis/sounds/wake.wav')),
            sound_listening=os.path.expanduser(sounds.get('listening', '~/.config/jarvis/sounds/listening.wav')),
        )
class AudioCapture:
    """Handles audio capture with VAD (Voice Activity Detection).

    Records at the microphone's native sample rate, runs Silero VAD (or an
    energy fallback) on 16 kHz chunks, and returns audio resampled to
    config.target_sample_rate for Whisper.
    """
    # Silero VAD requires exactly 512 samples at 16kHz (32ms)
    VAD_CHUNK_SAMPLES = 512

    def __init__(self, config: Config):
        self.config = config
        self.vad_model = None  # Silero model; None -> energy-based fallback
        self.device_index = self._find_device()
        self._load_vad()

    def _find_device(self) -> Optional[int]:
        """Find audio device by name.

        Returns the index of the first input-capable device whose name
        contains config.device_name (substring match), or None to use the
        system default device.
        """
        if not self.config.device_name:
            logger.info("Using default audio input device")
            return None
        devices = sd.query_devices()
        for i, d in enumerate(devices):
            if self.config.device_name in d['name'] and d['max_input_channels'] > 0:
                logger.info(f"Using audio device [{i}]: {d['name']}")
                return i
        logger.warning(f"Device '{self.config.device_name}' not found, using default")
        return None

    def _resample(self, audio: np.ndarray) -> np.ndarray:
        """Resample audio from native to target sample rate.

        No-op when both rates match. scipy is imported lazily so it is only
        required when resampling actually happens.
        """
        if self.config.native_sample_rate == self.config.target_sample_rate:
            return audio
        from scipy import signal
        target_length = int(len(audio) * self.config.target_sample_rate / self.config.native_sample_rate)
        return signal.resample(audio, target_length).astype(np.float32)

    def _load_vad(self):
        """Load Silero VAD model via torch.hub; fall back to None on failure."""
        if SILERO_AVAILABLE:
            try:
                self.vad_model, utils = torch.hub.load(
                    repo_or_dir='snakers4/silero-vad',
                    model='silero_vad',
                    force_reload=False,
                    trust_repo=True
                )
                # utils[0] is get_speech_timestamps per the silero-vad hub API.
                self.get_speech_timestamps = utils[0]
                logger.info("Silero VAD loaded successfully")
            except Exception as e:
                logger.warning(f"Failed to load Silero VAD: {e}")
                self.vad_model = None

    def is_speech(self, audio_chunk_16k: np.ndarray) -> bool:
        """Check if audio chunk contains speech using VAD.

        Args:
            audio_chunk_16k: Audio chunk already at 16kHz sample rate

        Returns:
            True if any 512-sample window scores above vad_threshold.
            Without a VAD model (or on VAD error) falls back to an RMS
            energy threshold of 0.01.
        """
        if self.vad_model is None:
            # Fallback: simple energy-based detection (RMS of the chunk)
            energy = np.sqrt(np.mean(audio_chunk_16k ** 2))
            return energy > 0.01
        try:
            # Silero VAD expects exactly 512 samples at 16kHz
            # Process in 512-sample windows and return True if any has speech
            chunk_size = self.VAD_CHUNK_SAMPLES
            has_speech = False
            for i in range(0, len(audio_chunk_16k) - chunk_size + 1, chunk_size):
                window = audio_chunk_16k[i:i + chunk_size]
                audio_tensor = torch.from_numpy(window.astype(np.float32))
                speech_prob = self.vad_model(audio_tensor, 16000).item()
                if speech_prob > self.config.vad_threshold:
                    has_speech = True
                    break
            return has_speech
        except Exception as e:
            logger.warning(f"VAD error: {e}")
            # Fallback to energy-based
            energy = np.sqrt(np.mean(audio_chunk_16k ** 2))
            return energy > 0.01

    def record_until_silence(self, max_duration: Optional[float] = None, min_duration: float = 0.5) -> np.ndarray:
        """Record audio until silence is detected.

        Records at native sample rate, performs real-time VAD at 16kHz,
        returns audio resampled to target sample rate.

        Args:
            max_duration: Maximum recording duration in seconds
            min_duration: Minimum recording duration before silence detection activates

        Returns:
            Audio array at target_sample_rate (16kHz for Whisper); empty
            array if nothing was captured.
        """
        max_duration = max_duration or self.config.max_listen_seconds
        # Calculate chunk sizes for native and 16kHz
        # We want ~32ms chunks (512 samples at 16kHz)
        vad_chunk_duration = self.VAD_CHUNK_SAMPLES / 16000  # ~32ms
        native_chunk_samples = int(self.config.native_sample_rate * vad_chunk_duration)
        silence_chunks_needed = int(self.config.silence_seconds / vad_chunk_duration)
        recorded_chunks = []     # raw chunks at native rate
        silence_count = 0        # consecutive non-speech chunks after speech began
        total_duration = 0
        speech_started = False
        speech_chunk_count = 0
        # Reset VAD state before recording (Silero keeps internal RNN state)
        if self.vad_model is not None:
            self.vad_model.reset_states()
        logger.info(f"Recording at {self.config.native_sample_rate}Hz... (waiting for speech)")

        def audio_callback(indata, frames, time_info, status):
            # Runs on the sounddevice audio thread; communicates with the
            # main loop via the nonlocal counters below.
            nonlocal silence_count, speech_started, total_duration, speech_chunk_count
            if status:
                logger.warning(f"Audio status: {status}")
            chunk = indata[:, 0].copy()  # first channel only
            recorded_chunks.append(chunk)
            total_duration += vad_chunk_duration
            # Resample chunk to 16kHz for VAD
            chunk_16k = self._resample(chunk) if self.config.native_sample_rate != 16000 else chunk
            if self.is_speech(chunk_16k):
                if not speech_started:
                    logger.info("Speech detected!")
                    speech_started = True
                speech_chunk_count += 1
                silence_count = 0
            elif speech_started:
                silence_count += 1

        with sd.InputStream(
            samplerate=self.config.native_sample_rate,
            channels=self.config.channels,
            dtype=np.float32,
            device=self.device_index,
            blocksize=native_chunk_samples,
            callback=audio_callback
        ):
            while total_duration < max_duration:
                # Only stop on silence if:
                # 1. Speech was detected
                # 2. We've recorded for at least min_duration
                # 3. We've had enough silence chunks
                can_stop = (
                    speech_started and
                    total_duration >= min_duration and
                    silence_count >= silence_chunks_needed
                )
                if can_stop:
                    logger.info(f"Silence detected after {speech_chunk_count} speech chunks, stopping")
                    break
                time.sleep(vad_chunk_duration)
        if recorded_chunks:
            # Concatenate and resample to target sample rate
            audio_native = np.concatenate(recorded_chunks)
            audio = self._resample(audio_native)
            duration = len(audio) / self.config.target_sample_rate
            logger.info(f"Recorded {duration:.1f}s of audio (speech detected: {speech_started})")
            return audio
        return np.array([], dtype=np.float32)
class WakeWordDetector:
    """Detects wake words in audio stream using openWakeWord."""

    # OpenWakeWord expects 16kHz audio
    WAKE_WORD_SAMPLE_RATE = 16000

    def __init__(self, config: Config):
        self.config = config
        self.model = None  # openWakeWord model; None -> detection disabled
        self.device_index = self._find_device()
        self._load_model()

    def _find_device(self) -> Optional[int]:
        """Find audio device by name (substring match), or None for default."""
        if not self.config.device_name:
            return None
        devices = sd.query_devices()
        for i, d in enumerate(devices):
            if self.config.device_name in d['name'] and d['max_input_channels'] > 0:
                return i
        return None

    def _load_model(self):
        """Load OpenWakeWord model; leaves self.model as None on any failure."""
        if not WAKE_WORD_AVAILABLE:
            logger.warning("Wake word detection not available")
            return
        try:
            # Download and load the "hey jarvis" model
            openwakeword.utils.download_models(['hey_jarvis_v0.1'])
            self.model = WakeWordModel(
                wakeword_models=['hey_jarvis_v0.1'],
                inference_framework='onnx'
            )
            logger.info("Wake word model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load wake word model: {e}")
            self.model = None

    def listen_for_wake_word(self, timeout: Optional[float] = None) -> tuple[bool, np.ndarray]:
        """Listen for wake word, return (detected, buffered_audio).

        Returns:
            Tuple of (wake_word_detected, audio_buffer).
            NOTE(review): despite what the buffering comment below may
            suggest, the returned buffer contains up to ~5s of audio from
            BEFORE the wake word (including the wake word itself) plus ~3s
            captured after detection — the rolling-buffer trim stops once
            the wake word fires. Callers feed the whole buffer to Whisper.
            With no model loaded, returns (True, empty) immediately so the
            daemon degrades to push-to-talk behavior.
        """
        if self.model is None:
            logger.warning("Wake word model not loaded, triggering immediately")
            return True, np.array([], dtype=np.float32)
        # OpenWakeWord expects 16kHz, 80ms chunks (1280 samples)
        chunk_duration = 0.08
        chunk_samples_16k = int(self.WAKE_WORD_SAMPLE_RATE * chunk_duration)
        # If recording at native rate, calculate native chunk size
        native_rate = self.config.native_sample_rate
        chunk_samples_native = int(native_rate * chunk_duration)
        detected = threading.Event()
        start_time = time.time()
        # Buffer to store audio chunks (native rate)
        audio_buffer = []
        wake_word_time = [None]  # Use list to allow modification in callback
        POST_WAKE_BUFFER_SECONDS = 3.0  # Continue capturing for 3s after wake word

        def audio_callback(indata, frames, time_info, status):
            # Audio-thread callback: buffers audio and runs wake-word inference.
            if status:
                logger.warning(f"Audio status: {status}")
            chunk_native = indata[:, 0].copy()
            # Always buffer audio (keep last ~5 seconds before wake word)
            audio_buffer.append(chunk_native)
            max_buffer_chunks = int(5.0 / chunk_duration)
            if len(audio_buffer) > max_buffer_chunks and wake_word_time[0] is None:
                audio_buffer.pop(0)
            # Resample to 16kHz for wake word detection
            if native_rate != self.WAKE_WORD_SAMPLE_RATE:
                from scipy import signal
                target_len = int(len(chunk_native) * self.WAKE_WORD_SAMPLE_RATE / native_rate)
                chunk_16k = signal.resample(chunk_native, target_len).astype(np.float32)
            else:
                chunk_16k = chunk_native
            # Convert to int16 for openwakeword
            audio_int16 = (chunk_16k * 32767).astype(np.int16)
            prediction = self.model.predict(audio_int16)
            # Check all wake word scores
            for model_name, score in prediction.items():
                if score > 0.5 and wake_word_time[0] is None:
                    logger.info(f"Wake word detected: {model_name} (score: {score:.2f})")
                    wake_word_time[0] = time.time()
                    detected.set()

        try:
            with sd.InputStream(
                samplerate=native_rate,
                channels=self.config.channels,
                dtype=np.float32,
                device=self.device_index,
                blocksize=chunk_samples_native,
                callback=audio_callback
            ):
                # Wait for wake word
                while not detected.is_set():
                    if timeout and (time.time() - start_time) > timeout:
                        return False, np.array([], dtype=np.float32)
                    time.sleep(0.05)
                # Continue capturing audio after wake word for a bit
                # This captures the rest of the user's sentence
                logger.info(f"Buffering post-wake-word audio for {POST_WAKE_BUFFER_SECONDS}s...")
                post_wake_start = time.time()
                while time.time() - post_wake_start < POST_WAKE_BUFFER_SECONDS:
                    time.sleep(0.05)
        except Exception as e:
            logger.error(f"Wake word detection error: {e}")
            return False, np.array([], dtype=np.float32)
        # Return the buffered audio (at native sample rate)
        if audio_buffer:
            buffered_audio = np.concatenate(audio_buffer)
            logger.info(f"Captured {len(buffered_audio) / native_rate:.1f}s of buffered audio")
            return True, buffered_audio
        return True, np.array([], dtype=np.float32)
class WhisperClient:
    """Transcribe audio using Whisper server or local fallback.

    Audio is expected as a float32 numpy array at config.target_sample_rate.
    """

    def __init__(self, config: Config):
        self.config = config
        self.local_model = None  # lazily-loaded faster-whisper model

    def transcribe(self, audio: np.ndarray) -> str:
        """Transcribe audio to text.

        Tries the remote Whisper server first; on any failure falls back to
        local faster-whisper (if enabled in config), otherwise returns "".
        """
        # Try remote Whisper server first
        try:
            return self._transcribe_remote(audio)
        except Exception as e:
            logger.warning(f"Remote Whisper failed: {e}")
        # Fallback to local if enabled
        if self.config.whisper_local_fallback:
            return self._transcribe_local(audio)
        return ""

    def _transcribe_remote(self, audio: np.ndarray) -> str:
        """Transcribe using remote Whisper server.

        Encodes the float audio as 16-bit PCM WAV in memory and POSTs it to
        config.whisper_url (whisper.cpp-style /inference endpoint).

        Raises:
            requests.RequestException: on network/HTTP errors (handled by caller).
        """
        # Convert to WAV bytes
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(self.config.channels)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.config.target_sample_rate)
            wav_file.writeframes((audio * 32767).astype(np.int16).tobytes())
        wav_buffer.seek(0)
        response = requests.post(
            self.config.whisper_url,
            files={'file': ('audio.wav', wav_buffer, 'audio/wav')},
            data={
                'language': self.config.language,
                'response_format': 'json'
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        text = result.get('text', '').strip()
        logger.info(f"Transcription: {text}")
        return text

    def _transcribe_local(self, audio: np.ndarray) -> str:
        """Transcribe using local faster-whisper.

        NOTE(review): the model name "base" is hardcoded here; the config
        file's whisper.model setting is not plumbed through Config — confirm
        whether it should be.
        """
        try:
            from faster_whisper import WhisperModel
            if self.local_model is None:
                logger.info("Loading local Whisper model...")
                self.local_model = WhisperModel("base", compute_type="int8")
            segments, _ = self.local_model.transcribe(
                audio,
                language=self.config.language[:2],  # 'nl' from 'nl'
                beam_size=5
            )
            text = " ".join(segment.text for segment in segments).strip()
            logger.info(f"Local transcription: {text}")
            return text
        except Exception as e:
            logger.error(f"Local Whisper failed: {e}")
            return ""
| class TTS: | |
| """Text-to-Speech using macOS say command.""" | |
| def __init__(self, config: Config): | |
| self.config = config | |
| def speak(self, text: str): | |
| """Speak text using TTS (blocking).""" | |
| if not text: | |
| return | |
| logger.info(f"Speaking: {text[:50]}...") | |
| try: | |
| subprocess.run([ | |
| self.config.tts_command, | |
| '-v', self.config.tts_voice, | |
| '-r', str(self.config.tts_rate), | |
| text | |
| ], check=True) | |
| except subprocess.CalledProcessError as e: | |
| logger.error(f"TTS failed: {e}") | |
class TmuxSession:
    """Manage Claude Code session via tmux.

    The Claude CLI runs interactively inside a detached tmux session; input
    is injected with `tmux send-keys` and output read with `capture-pane`.
    """

    def __init__(self, config: Config):
        self.config = config
        self.session_name = config.tmux_session
        self._last_output_line = 0  # cursor into captured pane output

    def exists(self) -> bool:
        """Check if tmux session exists (tmux has-session exit status)."""
        result = subprocess.run(
            ['tmux', 'has-session', '-t', self.session_name],
            capture_output=True
        )
        return result.returncode == 0

    def start(self, initial_text: str):
        """Start a new Claude session in tmux.

        Kills any existing session of the same name first, launches Claude
        interactively (so the session is resumable), then sends the jarvis
        prompt plus the user's first utterance.
        """
        if self.exists():
            logger.info(f"Killing existing tmux session: {self.session_name}")
            subprocess.run(['tmux', 'kill-session', '-t', self.session_name])
        # Start Claude interactively (no -p flag) so session is resumable
        import shlex
        cmd_parts = [self.config.llm_command]
        cmd_parts.extend(self.config.llm_extra_args)
        cmd = shlex.join(cmd_parts)
        logger.info(f"Starting tmux session: {self.session_name}")
        logger.info(f"Command: {cmd}")
        # Start Claude in interactive mode
        subprocess.run([
            'tmux', 'new-session', '-d', '-s', self.session_name, cmd
        ], check=True)
        self._last_output_line = 0
        time.sleep(2)  # Give Claude time to start and show prompt
        # Now send the initial prompt via tmux send-keys
        prompt_text = f'{self.config.llm_jarvis_prompt} {initial_text}'
        logger.info(f"Sending initial prompt: {prompt_text[:80]}...")
        self.send_input(prompt_text)

    def send_input(self, text: str):
        """Send text input to the Claude session (types text, presses Enter)."""
        if not self.exists():
            logger.warning("No tmux session to send input to")
            return
        logger.info(f"Sending to Claude: {text[:50]}...")
        subprocess.run([
            'tmux', 'send-keys', '-t', self.session_name, text, 'Enter'
        ], check=True)

    def get_new_output(self) -> str:
        """Get new output from the session since last check.

        Captures up to the last 1000 lines of the pane and returns only the
        lines past the previously recorded cursor position.
        """
        if not self.exists():
            return ""
        result = subprocess.run(
            ['tmux', 'capture-pane', '-t', self.session_name, '-p', '-S', '-1000'],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            return ""
        lines = result.stdout.split('\n')
        new_lines = lines[self._last_output_line:]
        self._last_output_line = len(lines)
        return '\n'.join(new_lines).strip()

    def is_claude_ready(self) -> bool:
        """Check if Claude has finished responding (waiting for input).

        Heuristic: inspects the last few pane lines for the '>' input
        prompt. NOTE(review): '>' appearing in ordinary output on the last
        line would also count as ready — confirm against the Claude CLI UI.
        """
        if not self.exists():
            return False
        # Capture the last few lines to check for the prompt
        result = subprocess.run(
            ['tmux', 'capture-pane', '-t', self.session_name, '-p', '-S', '-5'],
            capture_output=True,
            text=True
        )
        output = result.stdout.strip()
        # Claude shows '>' when ready for input
        return output.endswith('>') or '>' in output.split('\n')[-1]

    def kill(self):
        """Kill the tmux session if it exists."""
        if self.exists():
            subprocess.run(['tmux', 'kill-session', '-t', self.session_name])
class JarvisDaemon:
    """Main daemon orchestrating the voice assistant.

    Implements a state machine over DaemonState: IDLE waits for the wake
    word, LISTENING records speech, PROCESSING transcribes and forwards to
    Claude (via tmux), SPEAKING waits for Claude's reply, FOLLOW_UP listens
    again without requiring a wake word until the idle timeout expires.
    Audio in flight between states is carried in self._current_audio.
    """

    def __init__(self, config: Config, passive: bool = False):
        self.config = config
        self.passive = passive  # passive = monitoring only, no audio capture
        self.state = DaemonState.IDLE
        self.running = True
        self.last_activity = time.time()  # drives the follow-up idle timeout
        # Initialize components
        self.audio = AudioCapture(config)
        self.wake_word = WakeWordDetector(config)
        self.whisper = WhisperClient(config)
        self.tts = TTS(config)
        self.tmux = TmuxSession(config)
        # Setup signal handlers
        signal.signal(signal.SIGINT, self._handle_signal)
        signal.signal(signal.SIGTERM, self._handle_signal)

    def _handle_signal(self, signum, frame):
        """Handle shutdown signals by letting the main loop exit cleanly."""
        logger.info(f"Received signal {signum}, shutting down...")
        self.running = False

    def _play_sound(self, sound_path: str):
        """Play a sound file via afplay (macOS); silently skipped if missing."""
        if os.path.exists(sound_path):
            subprocess.run(['afplay', sound_path], capture_output=True)

    def _check_end_phrase(self, text: str) -> bool:
        """Check if text contains an end phrase (case-insensitive substring)."""
        text_lower = text.lower()
        return any(phrase in text_lower for phrase in self.config.end_phrases)

    def run(self):
        """Main daemon loop: dispatch on current state until shutdown.

        Any exception from a state handler is logged and the loop continues
        after a 1s pause, so transient errors don't kill the daemon.
        """
        logger.info("Jarvis daemon starting...")
        logger.info(f"Mode: {'passive' if self.passive else 'active'}")
        while self.running:
            try:
                if self.state == DaemonState.IDLE:
                    self._handle_idle()
                elif self.state == DaemonState.LISTENING:
                    self._handle_listening()
                elif self.state == DaemonState.PROCESSING:
                    self._handle_processing()
                elif self.state == DaemonState.SPEAKING:
                    self._handle_speaking()
                elif self.state == DaemonState.FOLLOW_UP:
                    self._handle_follow_up()
            except Exception as e:
                logger.error(f"Error in state {self.state}: {e}")
                time.sleep(1)
        logger.info("Jarvis daemon stopped")

    def _handle_idle(self):
        """Wait for wake word, then jump to PROCESSING (if audio was
        buffered during detection) or LISTENING."""
        logger.info("Waiting for wake word...")
        detected, buffered_audio = self.wake_word.listen_for_wake_word()
        if detected:
            self._play_sound(self.config.sound_wake)
            self.last_activity = time.time()
            # If we have buffered audio from wake word detection, use it directly
            if len(buffered_audio) > 0:
                # Resample buffered audio to target sample rate
                resampled = self.audio._resample(buffered_audio)
                logger.info(f"Using {len(resampled) / self.config.target_sample_rate:.1f}s of wake-word buffered audio")
                self._current_audio = resampled
                self.state = DaemonState.PROCESSING
            else:
                # No buffered audio, go to listening mode
                self.state = DaemonState.LISTENING

    def _handle_listening(self):
        """Record user speech (only used when no buffered audio from wake word)."""
        if self.passive:
            logger.info("Passive mode, skipping audio capture")
            time.sleep(1)
            return
        self._play_sound(self.config.sound_listening)
        logger.info("Listening...")
        audio = self.audio.record_until_silence()
        # Check if we got meaningful audio (at least 0.3s with speech detected)
        min_audio_samples = int(self.config.target_sample_rate * 0.3)
        if len(audio) > min_audio_samples:
            self._current_audio = audio
            self.state = DaemonState.PROCESSING
        else:
            logger.warning("No meaningful audio captured, returning to idle")
            self.state = DaemonState.IDLE

    def _is_meaningful_transcription(self, text: str) -> bool:
        """Check if transcription contains actual speech content.

        Filters empty strings, very short strings, pure punctuation/noise,
        and known Whisper no-speech/hallucination patterns (Dutch+English).
        """
        if not text:
            logger.debug("Empty transcription")
            return False
        # Strip whitespace and common noise patterns
        cleaned = text.strip()
        # Too short to be meaningful (need at least 4 chars)
        if len(cleaned) < 4:
            logger.debug(f"Transcription too short: {len(cleaned)} chars")
            return False
        # Check if it's only asterisks, dots, or other noise characters
        if all(c in '*.-… \t\n' for c in cleaned):
            logger.debug(f"Transcription is only noise chars: {cleaned!r}")
            return False
        # Common "no speech" patterns from Whisper
        noise_patterns = [
            '***', '****', '*****', '* * *',
            '...', '....', '…', '. . .',
            '[muziek]', '(muziek)', '*muziek*', '[music]', '[ music ]',
            '[stilte]', '(stilte)', '[silence]', '[ silence ]',
            '[geluid]', '(geluid)', '[sound]',
            '[applaus]', '(applaus)',
            'you', 'thank you', 'thanks for watching',  # Common Whisper hallucinations
        ]
        cleaned_lower = cleaned.lower().strip('*[]() .')
        for pattern in noise_patterns:
            pattern_clean = pattern.lower().strip('*[]() .')
            if cleaned_lower == pattern_clean:
                logger.debug(f"Transcription matches noise pattern: {pattern}")
                return False
        # Check if it's mostly punctuation/symbols (need at least 3 letters)
        alpha_count = sum(1 for c in cleaned if c.isalpha())
        if alpha_count < 3:
            logger.debug(f"Transcription has too few letters: {alpha_count}")
            return False
        return True

    def _handle_processing(self):
        """Transcribe self._current_audio and send the text to Claude.

        Noise/empty transcriptions drop back to FOLLOW_UP (session open) or
        IDLE; an end phrase speaks a goodbye and kills the session.
        """
        text = self.whisper.transcribe(self._current_audio)
        if not text:
            logger.warning("No transcription result")
            self.state = DaemonState.FOLLOW_UP if self.tmux.exists() else DaemonState.IDLE
            return
        # Check if transcription is meaningful (not just noise)
        if not self._is_meaningful_transcription(text):
            logger.info(f"Ignoring noise/empty transcription: {text!r}")
            self.state = DaemonState.FOLLOW_UP if self.tmux.exists() else DaemonState.IDLE
            return
        # Check for end phrase
        if self._check_end_phrase(text):
            logger.info("End phrase detected, closing session")
            self.tts.speak("Tot ziens!")
            self.tmux.kill()
            self.state = DaemonState.IDLE
            return
        # Start or continue session
        if not self.tmux.exists():
            self.tmux.start(text)
        else:
            self.tmux.send_input(text)
        self.state = DaemonState.SPEAKING
        self.last_activity = time.time()

    def _handle_speaking(self):
        """Wait for Claude response and speak it.

        NOTE(review): speech output is assumed to be produced by the /jarvis
        skill inside the Claude session, so this handler only waits for the
        prompt to return and then moves on — confirm the skill does TTS.
        """
        # Wait for Claude to finish
        max_wait = 120  # Max wait for Claude response
        start_time = time.time()
        while not self.tmux.is_claude_ready():
            if time.time() - start_time > max_wait:
                logger.warning("Timeout waiting for Claude response")
                break
            time.sleep(0.5)
        # Get the response (Claude outputs TTS via /jarvis skill)
        # The TTS is handled by the skill, so we just wait and move to follow-up
        time.sleep(1)  # Give TTS time to complete
        self.state = DaemonState.FOLLOW_UP
        self.last_activity = time.time()

    def _handle_follow_up(self):
        """Wait for follow-up speech (no wake word needed)."""
        # Check for idle timeout
        if time.time() - self.last_activity > self.config.idle_timeout_seconds:
            logger.info("Idle timeout, returning to wake word mode")
            self.tts.speak("Ik ga slapen. Zeg Hey Jarvis om me te wekken.")
            self.state = DaemonState.IDLE
            return
        if self.passive:
            time.sleep(1)
            return
        # Listen for follow-up (shorter timeout, no wake word)
        logger.info("Listening for follow-up...")
        self._play_sound(self.config.sound_listening)
        audio = self.audio.record_until_silence(max_duration=10)
        if len(audio) > self.config.target_sample_rate * 0.5:  # At least 0.5s of audio
            self._current_audio = audio
            self.state = DaemonState.PROCESSING
        else:
            # No speech detected, continue waiting
            time.sleep(0.5)
def test_wake_word(config: Config):
    """Interactively test wake word detection (30s timeout).

    Fix: listen_for_wake_word returns a (detected, audio_buffer) tuple; a
    tuple is always truthy, so the original `if detector.listen_for_wake_word(...)`
    could never reach the timeout branch. Unpack the flag explicitly.
    """
    print("Testing wake word detection. Say 'Hey Jarvis'...")
    detector = WakeWordDetector(config)
    detected, _ = detector.listen_for_wake_word(timeout=30)
    if detected:
        print("Wake word detected!")
    else:
        print("Timeout, no wake word detected")
def test_listen(config: Config):
    """Interactively test audio capture followed by transcription."""
    print("Testing audio capture. Speak something...")
    capture = AudioCapture(config)
    transcriber = WhisperClient(config)
    samples = capture.record_until_silence()
    if len(samples) == 0:
        print("No audio captured")
        return
    duration = len(samples) / config.target_sample_rate
    print(f"Captured {duration:.1f}s of audio")
    transcription = transcriber.transcribe(samples)
    print(f"Transcription: {transcription}")
def write_pid(pid_file: Optional[Path] = None) -> None:
    """Write this process's PID to a file.

    Generalized to accept an explicit destination while keeping the original
    zero-argument call working.

    Args:
        pid_file: Destination path. Defaults to
            ~/.local/state/jarvis/jarvis.pid. Missing parent directories
            are created.
    """
    if pid_file is None:
        pid_file = Path.home() / '.local' / 'state' / 'jarvis' / 'jarvis.pid'
    pid_file.parent.mkdir(parents=True, exist_ok=True)
    pid_file.write_text(str(os.getpid()))
def main():
    """CLI entry point: parse arguments, load config, run tests or the daemon."""
    parser = argparse.ArgumentParser(description='Jarvis Voice Assistant Daemon')
    parser.add_argument('--config', type=Path,
                        default=Path.home() / '.config' / 'jarvis' / 'config.yaml',
                        help='Path to config file')
    parser.add_argument('--test-wake', action='store_true',
                        help='Test wake word detection')
    parser.add_argument('--test-listen', action='store_true',
                        help='Test audio capture and transcription')
    parser.add_argument('--active', action='store_true',
                        help='Run in active mode (voice enabled)')
    parser.add_argument('--passive', action='store_true',
                        help='Run in passive mode (monitoring only)')
    parser.add_argument('--no-wake-word', action='store_true',
                        help='Skip wake word, start listening immediately')
    args = parser.parse_args()
    # Load config
    config = Config.load(args.config)
    # Run tests if requested (each exits without starting the daemon)
    if args.test_wake:
        test_wake_word(config)
        return
    if args.test_listen:
        test_listen(config)
        return
    # Write PID file
    write_pid()
    # Create and run daemon
    # NOTE: --active is accepted but unused here; active is the default mode.
    daemon = JarvisDaemon(config, passive=args.passive)
    # Skip wake word if requested
    if args.no_wake_word:
        daemon.state = DaemonState.LISTENING
    daemon.run()


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Start the Jarvis daemon in the background.
#
# Refuses to start when a live daemon is already recorded in the PID file,
# launches jarvis-daemon via nohup, and waits up to 10s for the daemon to
# write its own PID file before reporting success.
set -e

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"
LOG_FILE="$HOME/.local/state/jarvis/jarvis.log"

# Check if already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if kill -0 "$PID" 2>/dev/null; then
        echo "Jarvis daemon is already running (PID: $PID)"
        exit 1
    fi
    # Stale PID file from a dead daemon: remove it so the startup wait loop
    # below cannot be fooled by an unrelated process that recycled the PID.
    rm -f "$PID_FILE"
fi

# Create log directory if needed
mkdir -p "$(dirname "$LOG_FILE")"

# Parse arguments. MODE keeps last-one-wins semantics for --passive/--active.
MODE=""
NO_WAKE=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --passive)
            MODE="--passive"
            shift
            ;;
        --active)
            MODE="--active"
            shift
            ;;
        --no-wake-word)
            NO_WAKE="--no-wake-word"
            shift
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Collect options in an array: empty options contribute zero words, and
# quoting "${ARGS[@]}" avoids relying on unquoted word splitting.
# (Plain "if" statements are used instead of `[ ... ] && ...` so a false
# test does not trip `set -e`.)
ARGS=()
if [ -n "$MODE" ]; then
    ARGS+=("$MODE")
fi
if [ -n "$NO_WAKE" ]; then
    ARGS+=("$NO_WAKE")
fi

echo "Starting Jarvis daemon..."
nohup jarvis-daemon "${ARGS[@]}" > "$LOG_FILE" 2>&1 &
NOHUP_PID=$!

# Wait for daemon to start and write PID file (may take a few seconds for model loading)
echo "Waiting for daemon to initialize..."
for i in {1..10}; do
    if [ -f "$PID_FILE" ]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            echo "Jarvis daemon started (PID: $PID)"
            echo "Log file: $LOG_FILE"
            echo ""
            echo "Use 'jarvis-status' to check status"
            echo "Use 'jarvis-stop' to stop the daemon"
            echo "Use 'tmux attach -t jarvis' to view Claude session"
            exit 0
        fi
    fi
    sleep 1
done

# PID file never appeared; distinguish "still loading" from "crashed".
if kill -0 "$NOHUP_PID" 2>/dev/null; then
    echo "Daemon is starting but PID file not yet created."
    echo "Check progress with: tail -f $LOG_FILE"
    echo "Nohup PID: $NOHUP_PID"
else
    echo "Failed to start Jarvis daemon. Check $LOG_FILE for errors."
    tail -10 "$LOG_FILE"
    exit 1
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Check status of Jarvis daemon and session.
# Prints daemon liveness, the tmux Claude session state, the default audio
# input device, recent log lines, and a command cheat sheet.

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"
LOG_FILE="$HOME/.local/state/jarvis/jarvis.log"

# Report whether the daemon recorded in the PID file is alive.
print_daemon_status() {
    echo "Daemon:"
    if [ -f "$PID_FILE" ]; then
        local pid
        pid=$(cat "$PID_FILE")
        if kill -0 "$pid" 2>/dev/null; then
            echo " Status: Running (PID: $pid)"
            # CPU / memory / uptime of the daemon process
            ps -p "$pid" -o %cpu,%mem,etime | tail -1 | while read cpu mem time; do
                echo " CPU: $cpu% Memory: $mem% Uptime: $time"
            done
        else
            echo " Status: Not running (stale PID file)"
        fi
    else
        echo " Status: Not running"
    fi
}

# Report the tmux "jarvis" session and echo its last few lines of output.
print_session_status() {
    echo ""
    echo "Claude Session (tmux):"
    if ! tmux has-session -t jarvis 2>/dev/null; then
        echo " Status: No active session"
        return
    fi
    echo " Status: Active"
    echo " Attach: tmux attach -t jarvis"
    echo ""
    echo " Last output:"
    tmux capture-pane -t jarvis -p -S -5 | sed 's/^/ /'
}

# Query the default input device via sounddevice (best effort).
print_audio_status() {
    echo ""
    echo "Audio:"
    if command -v python3 &>/dev/null; then
        python3 -c "
import sounddevice as sd
try:
    default_input = sd.query_devices(kind='input')
    print(f\" Input device: {default_input['name']}\")
except Exception as e:
    print(f\" Input device: Error - {e}\")
" 2>/dev/null || echo " Input device: Unable to query"
    fi
}

# Show the tail of the daemon log, if one exists.
print_recent_log() {
    if [ -f "$LOG_FILE" ]; then
        echo ""
        echo "Recent log entries:"
        tail -5 "$LOG_FILE" | sed 's/^/ /'
    fi
}

print_commands() {
    echo ""
    echo "Commands:"
    echo " jarvis-start Start daemon"
    echo " jarvis-start --no-wake-word Start without wake word (immediate listen)"
    echo " jarvis-stop Stop daemon (keep tmux session)"
    echo " jarvis-stop --kill-session Stop daemon and kill session"
    echo " tmux attach -t jarvis View Claude session"
}

echo "=== Jarvis Voice Assistant Status ==="
echo ""
print_daemon_status
print_session_status
print_audio_status
print_recent_log
print_commands
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Stop the Jarvis daemon.
#
# Terminates the daemon recorded in the PID file (SIGTERM first, SIGKILL
# after ~5s of waiting). When no PID file exists, sweeps for orphaned
# jarvis-daemon processes instead. The tmux "jarvis" session is preserved
# unless --kill-session is given.

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"

# Check for --kill-session flag
KILL_SESSION=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --kill-session)
            KILL_SESSION=true
            shift
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: jarvis-stop [--kill-session]"
            echo " --kill-session Also kill the tmux Claude session"
            exit 1
            ;;
    esac
done

if [ ! -f "$PID_FILE" ]; then
    echo "No PID file found, checking for orphan processes..."
    # Find any running jarvis-daemon processes (pgrep may exit non-zero
    # when nothing matches, hence the `|| true`)
    ORPHAN_PIDS=$(pgrep -f "jarvis-daemon" 2>/dev/null || true)
    if [ -n "$ORPHAN_PIDS" ]; then
        echo "Found orphan jarvis-daemon process(es): $ORPHAN_PIDS"
        # First pass: polite SIGTERM to every orphan
        for OPID in $ORPHAN_PIDS; do
            echo "Killing orphan PID: $OPID"
            kill "$OPID" 2>/dev/null || true
        done
        sleep 1
        # Second pass: force kill any orphan that survived the SIGTERM
        for OPID in $ORPHAN_PIDS; do
            if kill -0 "$OPID" 2>/dev/null; then
                echo "Force killing PID: $OPID"
                kill -9 "$OPID" 2>/dev/null || true
            fi
        done
        echo "Orphan processes cleaned up"
    else
        echo "Jarvis daemon is not running"
    fi
    # Still check for tmux session: the session can outlive the daemon
    if tmux has-session -t jarvis 2>/dev/null; then
        if [ "$KILL_SESSION" = true ]; then
            tmux kill-session -t jarvis
            echo "Killed tmux jarvis session"
        else
            echo "Note: tmux jarvis session still exists (use --kill-session to remove)"
        fi
    fi
    exit 0
fi

PID=$(cat "$PID_FILE")
if kill -0 "$PID" 2>/dev/null; then
    echo "Stopping Jarvis daemon (PID: $PID)..."
    kill "$PID"
    # Wait for graceful shutdown (10 x 0.5s = up to ~5 seconds)
    for i in {1..10}; do
        if ! kill -0 "$PID" 2>/dev/null; then
            break
        fi
        sleep 0.5
    done
    # Force kill if still running after the grace period
    if kill -0 "$PID" 2>/dev/null; then
        echo "Force killing daemon..."
        kill -9 "$PID"
    fi
    echo "Jarvis daemon stopped"
else
    echo "Jarvis daemon was not running (stale PID file)"
fi

# Clean up PID file (also removes a stale one)
rm -f "$PID_FILE"

# Handle tmux session: keep it by default so Claude context survives restarts
if tmux has-session -t jarvis 2>/dev/null; then
    if [ "$KILL_SESSION" = true ]; then
        tmux kill-session -t jarvis
        echo "Killed tmux jarvis session"
    else
        echo "Note: tmux jarvis session still exists"
        echo " - Use 'tmux attach -t jarvis' to resume manually"
        echo " - Use 'jarvis-stop --kill-session' to also kill it"
    fi
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Jarvis Voice Assistant Daemon Dependencies | |
| # Audio capture | |
| sounddevice>=0.4.6 | |
| numpy>=1.24.0 | |
| # Wake word detection | |
| openwakeword>=0.6.0 | |
| # Voice Activity Detection | |
| silero-vad>=4.0 | |
| # HTTP client for Whisper server | |
| requests>=2.31.0 | |
| # Local Whisper fallback | |
| faster-whisper>=1.0.0 | |
| # Config parsing | |
| PyYAML>=6.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment