Skip to content

Instantly share code, notes, and snippets.

@ErikDeBruijn
Last active January 17, 2026 21:01
Show Gist options
  • Select an option

  • Save ErikDeBruijn/395c92a03218e709aebe5d55fc33a126 to your computer and use it in GitHub Desktop.

Select an option

Save ErikDeBruijn/395c92a03218e709aebe5d55fc33a126 to your computer and use it in GitHub Desktop.
Jarvis Voice Assistant Daemon - Always-on voice daemon for Claude Code
# Jarvis Voice Assistant Daemon Configuration
wake_words: ["hey jarvis", "jarvis"]
language: "nl"

llm:
  command: "/Users/erik/.claude/local/claude"  # Full path to the correct version
  jarvis_prompt: "/jarvis"                     # activate voice mode
  extra_args: []                               # e.g. ["--model", "opus"]
  tmux_session: "jarvis"                       # tmux session name

whisper:
  url: "http://10.1.1.64:8081/inference"
  local_fallback: true
  model: "base"                                # for the local fallback

audio:
  device_name: "Yeti X"       # Explicitly pick a microphone (or null for default)
  native_sample_rate: 48000   # Native sample rate of the microphone
  target_sample_rate: 16000   # Sample rate for VAD/Whisper
  channels: 1
  silence_seconds: 1.5        # sec of silence = end of utterance
  max_listen_seconds: 30      # max recording time
  vad_threshold: 0.5          # Silero VAD threshold

session:
  idle_timeout_seconds: 60    # No input -> close session
  end_phrases: ["tot ziens", "bedankt jarvis", "klaar", "goodbye", "exit"]

tts:
  command: "say"
  voice: "Xander"
  rate: 180

sounds:
  wake: "~/.config/jarvis/sounds/wake.wav"
  listening: "~/.config/jarvis/sounds/listening.wav"
#!/usr/bin/env python3
"""
Jarvis Voice Assistant Daemon
An always-on voice daemon that orchestrates Claude Code sessions.
Wake word triggers a new conversation, daemon manages the entire session
including follow-up responses.
Usage:
jarvis-daemon # Run in foreground
jarvis-daemon --test-wake # Test wake word detection
jarvis-daemon --test-listen # Test audio capture + VAD
jarvis-daemon --active # Active mode (voice enabled)
jarvis-daemon --passive # Passive mode (monitoring only)
"""
import argparse
import io
import logging
import os
import signal
import subprocess
import sys
import tempfile
import threading
import time
import wave
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import Optional
import numpy as np
import requests
import sounddevice as sd
import yaml
# Optional imports with graceful fallback.
# Each flag records whether the corresponding feature can be used so the
# rest of the module can degrade gracefully instead of crashing at import.
try:
    import openwakeword
    from openwakeword.model import Model as WakeWordModel
    WAKE_WORD_AVAILABLE = True
except ImportError:
    WAKE_WORD_AVAILABLE = False
    logging.warning("openwakeword not installed, wake word detection disabled")

try:
    import torch  # needed for Silero VAD (loaded via torch.hub)
    SILERO_AVAILABLE = True
except ImportError:
    SILERO_AVAILABLE = False
    logging.warning("torch not installed, using simple VAD")
# Configure logging for the whole daemon; all classes below share this logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class DaemonState(Enum):
    """States of the daemon's conversation state machine (see JarvisDaemon.run)."""
    IDLE = auto()        # Waiting for wake word
    LISTENING = auto()   # Recording user speech
    PROCESSING = auto()  # Sending to Whisper/Claude
    SPEAKING = auto()    # TTS playing
    FOLLOW_UP = auto()   # Waiting for follow-up (no wake word needed)
@dataclass
class Config:
    """Configuration loaded from YAML.

    Flattens the nested YAML sections (llm, whisper, audio, session, tts,
    sounds) into a flat set of attributes, applying defaults for anything
    missing from the file.
    """
    wake_words: list[str]
    language: str
    llm_command: str
    llm_jarvis_prompt: str
    llm_extra_args: list[str]
    tmux_session: str
    whisper_url: str
    whisper_local_fallback: bool
    device_name: Optional[str]
    native_sample_rate: int
    target_sample_rate: int
    channels: int
    silence_seconds: float
    max_listen_seconds: float
    vad_threshold: float
    idle_timeout_seconds: int
    end_phrases: list[str]
    tts_command: str
    tts_voice: str
    tts_rate: int
    sound_wake: str
    sound_listening: str

    @classmethod
    def load(cls, path: Path) -> 'Config':
        """Load configuration from *path*, filling in defaults.

        Robustness fix: ``yaml.safe_load`` returns ``None`` for an empty
        file, and ``data.get('llm', {})`` returns ``None`` when a section
        is present but explicitly null — both previously crashed with
        AttributeError. ``or {}`` normalizes those cases to empty dicts.
        """
        with open(path) as f:
            data = yaml.safe_load(f) or {}
        # Hoist section lookups so each is done once, not per key.
        llm = data.get('llm') or {}
        whisper = data.get('whisper') or {}
        audio = data.get('audio') or {}
        session = data.get('session') or {}
        tts = data.get('tts') or {}
        sounds = data.get('sounds') or {}
        return cls(
            wake_words=data.get('wake_words', ['hey jarvis', 'jarvis']),
            language=data.get('language', 'nl'),
            llm_command=llm.get('command', 'claude'),
            llm_jarvis_prompt=llm.get('jarvis_prompt', '/jarvis'),
            llm_extra_args=llm.get('extra_args', []),
            tmux_session=llm.get('tmux_session', 'jarvis'),
            whisper_url=whisper.get('url', 'http://10.1.1.64:8081/inference'),
            whisper_local_fallback=whisper.get('local_fallback', True),
            device_name=audio.get('device_name'),
            native_sample_rate=audio.get('native_sample_rate', 48000),
            target_sample_rate=audio.get('target_sample_rate', 16000),
            channels=audio.get('channels', 1),
            silence_seconds=audio.get('silence_seconds', 1.5),
            max_listen_seconds=audio.get('max_listen_seconds', 30),
            vad_threshold=audio.get('vad_threshold', 0.5),
            idle_timeout_seconds=session.get('idle_timeout_seconds', 60),
            end_phrases=session.get('end_phrases', ['tot ziens', 'bedankt jarvis', 'klaar']),
            tts_command=tts.get('command', 'say'),
            tts_voice=tts.get('voice', 'Xander'),
            tts_rate=tts.get('rate', 180),
            sound_wake=os.path.expanduser(sounds.get('wake', '~/.config/jarvis/sounds/wake.wav')),
            sound_listening=os.path.expanduser(sounds.get('listening', '~/.config/jarvis/sounds/listening.wav')),
        )
class AudioCapture:
    """Handles audio capture with VAD (Voice Activity Detection).

    Records at the microphone's native sample rate, runs VAD on 16kHz
    resampled chunks, and returns audio at the target (Whisper) rate.
    """

    # Silero VAD requires exactly 512 samples at 16kHz (32ms)
    VAD_CHUNK_SAMPLES = 512

    def __init__(self, config: Config):
        self.config = config
        self.vad_model = None  # Silero model; None -> energy-based fallback
        self.device_index = self._find_device()
        self._load_vad()

    def _find_device(self) -> Optional[int]:
        """Find audio device by name.

        Returns the sounddevice index of the first input device whose name
        contains ``config.device_name``, or None to use the default device.
        """
        if not self.config.device_name:
            logger.info("Using default audio input device")
            return None
        devices = sd.query_devices()
        for i, d in enumerate(devices):
            if self.config.device_name in d['name'] and d['max_input_channels'] > 0:
                logger.info(f"Using audio device [{i}]: {d['name']}")
                return i
        logger.warning(f"Device '{self.config.device_name}' not found, using default")
        return None

    def _resample(self, audio: np.ndarray) -> np.ndarray:
        """Resample audio from native to target sample rate (no-op if equal)."""
        if self.config.native_sample_rate == self.config.target_sample_rate:
            return audio
        # Imported lazily so scipy is only required when resampling happens.
        from scipy import signal
        target_length = int(len(audio) * self.config.target_sample_rate / self.config.native_sample_rate)
        return signal.resample(audio, target_length).astype(np.float32)

    def _load_vad(self):
        """Load Silero VAD model via torch.hub; leaves vad_model None on failure."""
        if SILERO_AVAILABLE:
            try:
                self.vad_model, utils = torch.hub.load(
                    repo_or_dir='snakers4/silero-vad',
                    model='silero_vad',
                    force_reload=False,
                    trust_repo=True
                )
                # utils[0] is get_speech_timestamps (kept for potential use;
                # not referenced elsewhere in this file).
                self.get_speech_timestamps = utils[0]
                logger.info("Silero VAD loaded successfully")
            except Exception as e:
                logger.warning(f"Failed to load Silero VAD: {e}")
                self.vad_model = None

    def is_speech(self, audio_chunk_16k: np.ndarray) -> bool:
        """Check if audio chunk contains speech using VAD.

        Args:
            audio_chunk_16k: Audio chunk already at 16kHz sample rate

        Returns:
            True if any 512-sample window's speech probability exceeds
            ``vad_threshold`` (or, without a VAD model, if RMS energy > 0.01).
        """
        if self.vad_model is None:
            # Fallback: simple energy-based detection
            energy = np.sqrt(np.mean(audio_chunk_16k ** 2))
            return energy > 0.01
        try:
            # Silero VAD expects exactly 512 samples at 16kHz
            # Process in 512-sample windows and return True if any has speech
            chunk_size = self.VAD_CHUNK_SAMPLES
            has_speech = False
            for i in range(0, len(audio_chunk_16k) - chunk_size + 1, chunk_size):
                window = audio_chunk_16k[i:i + chunk_size]
                audio_tensor = torch.from_numpy(window.astype(np.float32))
                speech_prob = self.vad_model(audio_tensor, 16000).item()
                if speech_prob > self.config.vad_threshold:
                    has_speech = True
                    break
            return has_speech
        except Exception as e:
            logger.warning(f"VAD error: {e}")
            # Fallback to energy-based
            energy = np.sqrt(np.mean(audio_chunk_16k ** 2))
            return energy > 0.01

    def record_until_silence(self, max_duration: Optional[float] = None, min_duration: float = 0.5) -> np.ndarray:
        """Record audio until silence is detected.

        Records at native sample rate, performs real-time VAD at 16kHz,
        returns audio resampled to target sample rate.

        Args:
            max_duration: Maximum recording duration in seconds
            min_duration: Minimum recording duration before silence detection activates

        Returns:
            Audio array at target_sample_rate (16kHz for Whisper)
        """
        max_duration = max_duration or self.config.max_listen_seconds
        # Calculate chunk sizes for native and 16kHz
        # We want ~32ms chunks (512 samples at 16kHz)
        vad_chunk_duration = self.VAD_CHUNK_SAMPLES / 16000  # ~32ms
        native_chunk_samples = int(self.config.native_sample_rate * vad_chunk_duration)
        silence_chunks_needed = int(self.config.silence_seconds / vad_chunk_duration)

        recorded_chunks = []      # raw chunks at native rate, appended by callback
        silence_count = 0         # consecutive non-speech chunks since speech began
        total_duration = 0        # approximate elapsed time, in vad_chunk_duration steps
        speech_started = False
        speech_chunk_count = 0

        # Reset VAD state before recording
        if self.vad_model is not None:
            self.vad_model.reset_states()

        logger.info(f"Recording at {self.config.native_sample_rate}Hz... (waiting for speech)")

        def audio_callback(indata, frames, time_info, status):
            # Runs on the audio thread; the main thread polls the
            # nonlocal counters below to decide when to stop.
            nonlocal silence_count, speech_started, total_duration, speech_chunk_count
            if status:
                logger.warning(f"Audio status: {status}")
            chunk = indata[:, 0].copy()
            recorded_chunks.append(chunk)
            total_duration += vad_chunk_duration
            # Resample chunk to 16kHz for VAD
            chunk_16k = self._resample(chunk) if self.config.native_sample_rate != 16000 else chunk
            if self.is_speech(chunk_16k):
                if not speech_started:
                    logger.info("Speech detected!")
                    speech_started = True
                speech_chunk_count += 1
                silence_count = 0
            elif speech_started:
                silence_count += 1

        with sd.InputStream(
            samplerate=self.config.native_sample_rate,
            channels=self.config.channels,
            dtype=np.float32,
            device=self.device_index,
            blocksize=native_chunk_samples,
            callback=audio_callback
        ):
            while total_duration < max_duration:
                # Only stop on silence if:
                # 1. Speech was detected
                # 2. We've recorded for at least min_duration
                # 3. We've had enough silence chunks
                can_stop = (
                    speech_started and
                    total_duration >= min_duration and
                    silence_count >= silence_chunks_needed
                )
                if can_stop:
                    logger.info(f"Silence detected after {speech_chunk_count} speech chunks, stopping")
                    break
                time.sleep(vad_chunk_duration)

        if recorded_chunks:
            # Concatenate and resample to target sample rate
            audio_native = np.concatenate(recorded_chunks)
            audio = self._resample(audio_native)
            duration = len(audio) / self.config.target_sample_rate
            logger.info(f"Recorded {duration:.1f}s of audio (speech detected: {speech_started})")
            return audio
        return np.array([], dtype=np.float32)
class WakeWordDetector:
    """Detects wake words in audio stream using OpenWakeWord."""

    # OpenWakeWord expects 16kHz audio
    WAKE_WORD_SAMPLE_RATE = 16000

    def __init__(self, config: Config):
        self.config = config
        self.model = None  # OpenWakeWord model; None -> detection disabled
        self.device_index = self._find_device()
        self._load_model()

    def _find_device(self) -> Optional[int]:
        """Find audio device by name (None -> default input device)."""
        if not self.config.device_name:
            return None
        devices = sd.query_devices()
        for i, d in enumerate(devices):
            if self.config.device_name in d['name'] and d['max_input_channels'] > 0:
                return i
        return None

    def _load_model(self):
        """Load OpenWakeWord model; leaves self.model None on any failure."""
        if not WAKE_WORD_AVAILABLE:
            logger.warning("Wake word detection not available")
            return
        try:
            # Download and load the "hey jarvis" model
            openwakeword.utils.download_models(['hey_jarvis_v0.1'])
            self.model = WakeWordModel(
                wakeword_models=['hey_jarvis_v0.1'],
                inference_framework='onnx'
            )
            logger.info("Wake word model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load wake word model: {e}")
            self.model = None

    def listen_for_wake_word(self, timeout: Optional[float] = None) -> tuple[bool, np.ndarray]:
        """Listen for wake word, return (detected, post_wake_audio).

        Returns:
            Tuple of (wake_word_detected, audio_buffer)
            The audio_buffer contains audio captured AFTER wake word detection,
            which should be prepended to the main recording.

        Note: without a loaded model this triggers immediately with an
        empty buffer so the daemon stays usable.
        """
        if self.model is None:
            logger.warning("Wake word model not loaded, triggering immediately")
            return True, np.array([], dtype=np.float32)

        # OpenWakeWord expects 16kHz, 80ms chunks (1280 samples)
        chunk_duration = 0.08
        chunk_samples_16k = int(self.WAKE_WORD_SAMPLE_RATE * chunk_duration)
        # If recording at native rate, calculate native chunk size
        native_rate = self.config.native_sample_rate
        chunk_samples_native = int(native_rate * chunk_duration)

        detected = threading.Event()
        start_time = time.time()
        # Buffer to store audio chunks (native rate)
        audio_buffer = []
        wake_word_time = [None]  # Use list to allow modification in callback
        POST_WAKE_BUFFER_SECONDS = 3.0  # Continue capturing for 3s after wake word

        def audio_callback(indata, frames, time_info, status):
            # Runs on the audio thread: buffers audio and runs the wake-word
            # model per chunk; signals the main thread via `detected`.
            if status:
                logger.warning(f"Audio status: {status}")
            chunk_native = indata[:, 0].copy()
            # Always buffer audio (keep last ~5 seconds before wake word)
            audio_buffer.append(chunk_native)
            max_buffer_chunks = int(5.0 / chunk_duration)
            # Once the wake word fired, stop trimming so everything after it is kept.
            if len(audio_buffer) > max_buffer_chunks and wake_word_time[0] is None:
                audio_buffer.pop(0)
            # Resample to 16kHz for wake word detection
            if native_rate != self.WAKE_WORD_SAMPLE_RATE:
                from scipy import signal
                target_len = int(len(chunk_native) * self.WAKE_WORD_SAMPLE_RATE / native_rate)
                chunk_16k = signal.resample(chunk_native, target_len).astype(np.float32)
            else:
                chunk_16k = chunk_native
            # Convert to int16 for openwakeword
            audio_int16 = (chunk_16k * 32767).astype(np.int16)
            prediction = self.model.predict(audio_int16)
            # Check all wake word scores
            for model_name, score in prediction.items():
                if score > 0.5 and wake_word_time[0] is None:
                    logger.info(f"Wake word detected: {model_name} (score: {score:.2f})")
                    wake_word_time[0] = time.time()
                    detected.set()

        try:
            with sd.InputStream(
                samplerate=native_rate,
                channels=self.config.channels,
                dtype=np.float32,
                device=self.device_index,
                blocksize=chunk_samples_native,
                callback=audio_callback
            ):
                # Wait for wake word
                while not detected.is_set():
                    if timeout and (time.time() - start_time) > timeout:
                        return False, np.array([], dtype=np.float32)
                    time.sleep(0.05)
                # Continue capturing audio after wake word for a bit
                # This captures the rest of the user's sentence
                logger.info(f"Buffering post-wake-word audio for {POST_WAKE_BUFFER_SECONDS}s...")
                post_wake_start = time.time()
                while time.time() - post_wake_start < POST_WAKE_BUFFER_SECONDS:
                    time.sleep(0.05)
        except Exception as e:
            logger.error(f"Wake word detection error: {e}")
            return False, np.array([], dtype=np.float32)

        # Return the buffered audio (at native sample rate)
        if audio_buffer:
            buffered_audio = np.concatenate(audio_buffer)
            logger.info(f"Captured {len(buffered_audio) / native_rate:.1f}s of buffered audio")
            return True, buffered_audio
        return True, np.array([], dtype=np.float32)
class WhisperClient:
    """Transcribe audio using Whisper server or local fallback."""

    def __init__(self, config: Config):
        self.config = config
        self.local_model = None  # faster-whisper model, created on first use

    def transcribe(self, audio: np.ndarray) -> str:
        """Transcribe audio to text.

        The remote Whisper server is tried first; if it fails and the
        local fallback is enabled, faster-whisper is used instead.
        Returns an empty string when nothing could be transcribed.
        """
        try:
            return self._transcribe_remote(audio)
        except Exception as e:
            logger.warning(f"Remote Whisper failed: {e}")
        if not self.config.whisper_local_fallback:
            return ""
        return self._transcribe_local(audio)

    def _transcribe_remote(self, audio: np.ndarray) -> str:
        """Transcribe via the remote Whisper HTTP server (raises on failure)."""
        # Encode the float audio as an in-memory 16-bit PCM WAV file.
        pcm16 = (audio * 32767).astype(np.int16)
        buf = io.BytesIO()
        with wave.open(buf, 'wb') as wav:
            wav.setnchannels(self.config.channels)
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(self.config.target_sample_rate)
            wav.writeframes(pcm16.tobytes())
        buf.seek(0)

        resp = requests.post(
            self.config.whisper_url,
            files={'file': ('audio.wav', buf, 'audio/wav')},
            data={
                'language': self.config.language,
                'response_format': 'json',
            },
            timeout=30,
        )
        resp.raise_for_status()
        text = resp.json().get('text', '').strip()
        logger.info(f"Transcription: {text}")
        return text

    def _transcribe_local(self, audio: np.ndarray) -> str:
        """Transcribe with faster-whisper, loading the model lazily."""
        try:
            from faster_whisper import WhisperModel
            if self.local_model is None:
                logger.info("Loading local Whisper model...")
                self.local_model = WhisperModel("base", compute_type="int8")
            lang = self.config.language[:2]  # 'nl' from 'nl'
            segments, _ = self.local_model.transcribe(
                audio, language=lang, beam_size=5
            )
            text = " ".join(seg.text for seg in segments).strip()
            logger.info(f"Local transcription: {text}")
            return text
        except Exception as e:
            logger.error(f"Local Whisper failed: {e}")
            return ""
class TTS:
    """Text-to-Speech using macOS say command."""

    def __init__(self, config: Config):
        self.config = config

    def speak(self, text: str):
        """Speak text using TTS (blocking). Empty text is a no-op."""
        if not text:
            return
        logger.info(f"Speaking: {text[:50]}...")
        command = [
            self.config.tts_command,
            '-v', self.config.tts_voice,
            '-r', str(self.config.tts_rate),
            text,
        ]
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"TTS failed: {e}")
class TmuxSession:
    """Manage Claude Code session via tmux."""

    def __init__(self, config: Config):
        self.config = config
        self.session_name = config.tmux_session
        # Count of pane lines already consumed by get_new_output()
        self._last_output_line = 0

    def exists(self) -> bool:
        """Check if tmux session exists."""
        result = subprocess.run(
            ['tmux', 'has-session', '-t', self.session_name],
            capture_output=True
        )
        return result.returncode == 0

    def start(self, initial_text: str):
        """Start a new Claude session in tmux.

        Kills any existing session of the same name, launches Claude
        interactively, then types the jarvis prompt plus *initial_text*.
        """
        if self.exists():
            logger.info(f"Killing existing tmux session: {self.session_name}")
            subprocess.run(['tmux', 'kill-session', '-t', self.session_name])

        # Start Claude interactively (no -p flag) so session is resumable
        import shlex
        cmd_parts = [self.config.llm_command]
        cmd_parts.extend(self.config.llm_extra_args)
        cmd = shlex.join(cmd_parts)

        logger.info(f"Starting tmux session: {self.session_name}")
        logger.info(f"Command: {cmd}")

        # Start Claude in interactive mode
        subprocess.run([
            'tmux', 'new-session', '-d', '-s', self.session_name, cmd
        ], check=True)
        self._last_output_line = 0
        time.sleep(2)  # Give Claude time to start and show prompt

        # Now send the initial prompt via tmux send-keys
        prompt_text = f'{self.config.llm_jarvis_prompt} {initial_text}'
        logger.info(f"Sending initial prompt: {prompt_text[:80]}...")
        self.send_input(prompt_text)

    def send_input(self, text: str):
        """Send text input to the Claude session (typed + Enter)."""
        if not self.exists():
            logger.warning("No tmux session to send input to")
            return
        logger.info(f"Sending to Claude: {text[:50]}...")
        subprocess.run([
            'tmux', 'send-keys', '-t', self.session_name, text, 'Enter'
        ], check=True)

    def get_new_output(self) -> str:
        """Get new output from the session since last check.

        Captures up to 1000 lines of scrollback and returns everything
        past the previously seen line count.
        NOTE(review): counting lines assumes the pane only grows; output
        scrolled past the 1000-line capture window would skew the offset
        -- confirm before relying on exact boundaries.
        """
        if not self.exists():
            return ""
        result = subprocess.run(
            ['tmux', 'capture-pane', '-t', self.session_name, '-p', '-S', '-1000'],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            return ""
        lines = result.stdout.split('\n')
        new_lines = lines[self._last_output_line:]
        self._last_output_line = len(lines)
        return '\n'.join(new_lines).strip()

    def is_claude_ready(self) -> bool:
        """Check if Claude has finished responding (waiting for input)."""
        if not self.exists():
            return False
        # Capture the last few lines to check for the prompt
        result = subprocess.run(
            ['tmux', 'capture-pane', '-t', self.session_name, '-p', '-S', '-5'],
            capture_output=True,
            text=True
        )
        output = result.stdout.strip()
        # Claude shows '>' when ready for input
        return output.endswith('>') or '>' in output.split('\n')[-1]

    def kill(self):
        """Kill the tmux session."""
        if self.exists():
            subprocess.run(['tmux', 'kill-session', '-t', self.session_name])
class JarvisDaemon:
    """Main daemon orchestrating the voice assistant.

    Runs a state machine (see DaemonState): wait for the wake word,
    record speech, transcribe it, forward it to the Claude tmux session,
    then keep listening for follow-ups until an end phrase or idle timeout.
    """

    def __init__(self, config: Config, passive: bool = False):
        self.config = config
        self.passive = passive  # passive mode: monitoring only, no audio capture
        self.state = DaemonState.IDLE
        self.running = True
        self.last_activity = time.time()  # drives the follow-up idle timeout

        # Initialize components
        self.audio = AudioCapture(config)
        self.wake_word = WakeWordDetector(config)
        self.whisper = WhisperClient(config)
        self.tts = TTS(config)
        self.tmux = TmuxSession(config)

        # Setup signal handlers
        signal.signal(signal.SIGINT, self._handle_signal)
        signal.signal(signal.SIGTERM, self._handle_signal)

    def _handle_signal(self, signum, frame):
        """Handle shutdown signals by letting run() exit its loop."""
        logger.info(f"Received signal {signum}, shutting down...")
        self.running = False

    def _play_sound(self, sound_path: str):
        """Play a sound file via afplay; silently skips missing files."""
        if os.path.exists(sound_path):
            subprocess.run(['afplay', sound_path], capture_output=True)

    def _check_end_phrase(self, text: str) -> bool:
        """Check if text contains an end phrase (case-insensitive substring)."""
        text_lower = text.lower()
        return any(phrase in text_lower for phrase in self.config.end_phrases)

    def run(self):
        """Main daemon loop: dispatch to the handler for the current state."""
        logger.info("Jarvis daemon starting...")
        logger.info(f"Mode: {'passive' if self.passive else 'active'}")
        while self.running:
            try:
                if self.state == DaemonState.IDLE:
                    self._handle_idle()
                elif self.state == DaemonState.LISTENING:
                    self._handle_listening()
                elif self.state == DaemonState.PROCESSING:
                    self._handle_processing()
                elif self.state == DaemonState.SPEAKING:
                    self._handle_speaking()
                elif self.state == DaemonState.FOLLOW_UP:
                    self._handle_follow_up()
            except Exception as e:
                # Keep the daemon alive on any per-state error
                logger.error(f"Error in state {self.state}: {e}")
                time.sleep(1)
        logger.info("Jarvis daemon stopped")

    def _handle_idle(self):
        """Wait for wake word; on detection go to PROCESSING or LISTENING."""
        logger.info("Waiting for wake word...")
        detected, buffered_audio = self.wake_word.listen_for_wake_word()
        if detected:
            self._play_sound(self.config.sound_wake)
            self.last_activity = time.time()
            # If we have buffered audio from wake word detection, use it directly
            if len(buffered_audio) > 0:
                # Resample buffered audio to target sample rate
                resampled = self.audio._resample(buffered_audio)
                logger.info(f"Using {len(resampled) / self.config.target_sample_rate:.1f}s of wake-word buffered audio")
                self._current_audio = resampled
                self.state = DaemonState.PROCESSING
            else:
                # No buffered audio, go to listening mode
                self.state = DaemonState.LISTENING

    def _handle_listening(self):
        """Record user speech (only used when no buffered audio from wake word)."""
        if self.passive:
            logger.info("Passive mode, skipping audio capture")
            time.sleep(1)
            return
        self._play_sound(self.config.sound_listening)
        logger.info("Listening...")
        audio = self.audio.record_until_silence()
        # Check if we got meaningful audio (at least 0.3s with speech detected)
        min_audio_samples = int(self.config.target_sample_rate * 0.3)
        if len(audio) > min_audio_samples:
            self._current_audio = audio
            self.state = DaemonState.PROCESSING
        else:
            logger.warning("No meaningful audio captured, returning to idle")
            self.state = DaemonState.IDLE

    def _is_meaningful_transcription(self, text: str) -> bool:
        """Check if transcription contains actual speech content.

        Filters empty/too-short results, pure punctuation, and common
        Whisper noise/hallucination strings (e.g. '[muziek]', 'thank you').
        """
        if not text:
            logger.debug("Empty transcription")
            return False
        # Strip whitespace and common noise patterns
        cleaned = text.strip()
        # Too short to be meaningful (need at least 4 chars)
        if len(cleaned) < 4:
            logger.debug(f"Transcription too short: {len(cleaned)} chars")
            return False
        # Check if it's only asterisks, dots, or other noise characters
        if all(c in '*.-… \t\n' for c in cleaned):
            logger.debug(f"Transcription is only noise chars: {cleaned!r}")
            return False
        # Common "no speech" patterns from Whisper
        noise_patterns = [
            '***', '****', '*****', '* * *',
            '...', '....', '…', '. . .',
            '[muziek]', '(muziek)', '*muziek*', '[music]', '[ music ]',
            '[stilte]', '(stilte)', '[silence]', '[ silence ]',
            '[geluid]', '(geluid)', '[sound]',
            '[applaus]', '(applaus)',
            'you', 'thank you', 'thanks for watching',  # Common Whisper hallucinations
        ]
        cleaned_lower = cleaned.lower().strip('*[]() .')
        for pattern in noise_patterns:
            pattern_clean = pattern.lower().strip('*[]() .')
            if cleaned_lower == pattern_clean:
                logger.debug(f"Transcription matches noise pattern: {pattern}")
                return False
        # Check if it's mostly punctuation/symbols (need at least 3 letters)
        alpha_count = sum(1 for c in cleaned if c.isalpha())
        if alpha_count < 3:
            logger.debug(f"Transcription has too few letters: {alpha_count}")
            return False
        return True

    def _handle_processing(self):
        """Transcribe self._current_audio and route it to Claude."""
        text = self.whisper.transcribe(self._current_audio)
        if not text:
            logger.warning("No transcription result")
            # Stay in the conversation only if a session already exists.
            self.state = DaemonState.FOLLOW_UP if self.tmux.exists() else DaemonState.IDLE
            return
        # Check if transcription is meaningful (not just noise)
        if not self._is_meaningful_transcription(text):
            logger.info(f"Ignoring noise/empty transcription: {text!r}")
            self.state = DaemonState.FOLLOW_UP if self.tmux.exists() else DaemonState.IDLE
            return
        # Check for end phrase
        if self._check_end_phrase(text):
            logger.info("End phrase detected, closing session")
            self.tts.speak("Tot ziens!")
            self.tmux.kill()
            self.state = DaemonState.IDLE
            return
        # Start or continue session
        if not self.tmux.exists():
            self.tmux.start(text)
        else:
            self.tmux.send_input(text)
        self.state = DaemonState.SPEAKING
        self.last_activity = time.time()

    def _handle_speaking(self):
        """Wait for Claude response and speak it."""
        # Wait for Claude to finish
        max_wait = 120  # Max wait for Claude response
        start_time = time.time()
        while not self.tmux.is_claude_ready():
            if time.time() - start_time > max_wait:
                logger.warning("Timeout waiting for Claude response")
                break
            time.sleep(0.5)
        # Get the response (Claude outputs TTS via /jarvis skill)
        # The TTS is handled by the skill, so we just wait and move to follow-up
        time.sleep(1)  # Give TTS time to complete
        self.state = DaemonState.FOLLOW_UP
        self.last_activity = time.time()

    def _handle_follow_up(self):
        """Wait for follow-up speech (no wake word needed)."""
        # Check for idle timeout
        if time.time() - self.last_activity > self.config.idle_timeout_seconds:
            logger.info("Idle timeout, returning to wake word mode")
            self.tts.speak("Ik ga slapen. Zeg Hey Jarvis om me te wekken.")
            self.state = DaemonState.IDLE
            return
        if self.passive:
            time.sleep(1)
            return
        # Listen for follow-up (shorter timeout, no wake word)
        logger.info("Listening for follow-up...")
        self._play_sound(self.config.sound_listening)
        audio = self.audio.record_until_silence(max_duration=10)
        if len(audio) > self.config.target_sample_rate * 0.5:  # At least 0.5s of audio
            self._current_audio = audio
            self.state = DaemonState.PROCESSING
        else:
            # No speech detected, continue waiting
            time.sleep(0.5)
def test_wake_word(config: Config):
    """Test wake word detection interactively from the command line.

    Bug fix: listen_for_wake_word() returns a (detected, audio) tuple.
    The original truth-tested the tuple itself, which is always truthy,
    so the timeout branch could never be reached. Unpack the flag.
    """
    print("Testing wake word detection. Say 'Hey Jarvis'...")
    detector = WakeWordDetector(config)
    detected, _ = detector.listen_for_wake_word(timeout=30)
    if detected:
        print("Wake word detected!")
    else:
        print("Timeout, no wake word detected")
def test_listen(config: Config):
    """Test audio capture and transcription."""
    print("Testing audio capture. Speak something...")
    capture = AudioCapture(config)
    transcriber = WhisperClient(config)
    audio = capture.record_until_silence()
    # Guard clause: bail out early when nothing was recorded.
    if len(audio) == 0:
        print("No audio captured")
        return
    print(f"Captured {len(audio) / config.target_sample_rate:.1f}s of audio")
    text = transcriber.transcribe(audio)
    print(f"Transcription: {text}")
def write_pid():
    """Write this process's PID to ~/.local/state/jarvis/jarvis.pid."""
    state_dir = Path.home() / '.local' / 'state' / 'jarvis'
    state_dir.mkdir(parents=True, exist_ok=True)
    (state_dir / 'jarvis.pid').write_text(str(os.getpid()))
def main():
    """Parse CLI arguments, load config, then run a test mode or the daemon."""
    parser = argparse.ArgumentParser(description='Jarvis Voice Assistant Daemon')
    parser.add_argument('--config', type=Path,
                        default=Path.home() / '.config' / 'jarvis' / 'config.yaml',
                        help='Path to config file')
    parser.add_argument('--test-wake', action='store_true',
                        help='Test wake word detection')
    parser.add_argument('--test-listen', action='store_true',
                        help='Test audio capture and transcription')
    parser.add_argument('--active', action='store_true',
                        help='Run in active mode (voice enabled)')
    parser.add_argument('--passive', action='store_true',
                        help='Run in passive mode (monitoring only)')
    parser.add_argument('--no-wake-word', action='store_true',
                        help='Skip wake word, start listening immediately')
    args = parser.parse_args()

    # Load config
    config = Config.load(args.config)

    # Run tests if requested
    if args.test_wake:
        test_wake_word(config)
        return
    if args.test_listen:
        test_listen(config)
        return

    # Write PID file
    write_pid()

    # Create and run daemon
    # NOTE(review): --active is accepted but never read; active is simply
    # the default when --passive is absent.
    daemon = JarvisDaemon(config, passive=args.passive)

    # Skip wake word if requested
    if args.no_wake_word:
        daemon.state = DaemonState.LISTENING

    daemon.run()


if __name__ == '__main__':
    main()
#!/bin/bash
# Start the Jarvis daemon in the background via nohup, then wait for it
# to write its PID file before reporting success.
set -e

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"
LOG_FILE="$HOME/.local/state/jarvis/jarvis.log"

# Check if already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if kill -0 "$PID" 2>/dev/null; then
        echo "Jarvis daemon is already running (PID: $PID)"
        exit 1
    fi
fi

# Create log directory if needed
mkdir -p "$(dirname "$LOG_FILE")"

# Parse arguments (forwarded to jarvis-daemon)
MODE=""
NO_WAKE=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --passive)
            MODE="--passive"
            shift
            ;;
        --active)
            MODE="--active"
            shift
            ;;
        --no-wake-word)
            NO_WAKE="--no-wake-word"
            shift
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

echo "Starting Jarvis daemon..."
nohup jarvis-daemon $MODE $NO_WAKE > "$LOG_FILE" 2>&1 &
NOHUP_PID=$!

# Wait for daemon to start and write PID file (may take a few seconds for model loading)
echo "Waiting for daemon to initialize..."
for i in {1..10}; do
    if [ -f "$PID_FILE" ]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            echo "Jarvis daemon started (PID: $PID)"
            echo "Log file: $LOG_FILE"
            echo ""
            echo "Use 'jarvis-status' to check status"
            echo "Use 'jarvis-stop' to stop the daemon"
            echo "Use 'tmux attach -t jarvis' to view Claude session"
            exit 0
        fi
    fi
    sleep 1
done

# Check if nohup process is still running
if kill -0 "$NOHUP_PID" 2>/dev/null; then
    echo "Daemon is starting but PID file not yet created."
    echo "Check progress with: tail -f $LOG_FILE"
    echo "Nohup PID: $NOHUP_PID"
else
    echo "Failed to start Jarvis daemon. Check $LOG_FILE for errors."
    tail -10 "$LOG_FILE"
    exit 1
fi
#!/bin/bash
# Check status of Jarvis daemon and session: daemon process, tmux pane,
# audio input device, and recent log lines.

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"
LOG_FILE="$HOME/.local/state/jarvis/jarvis.log"

echo "=== Jarvis Voice Assistant Status ==="
echo ""

# Check daemon status via the PID file + kill -0 liveness probe
echo "Daemon:"
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if kill -0 "$PID" 2>/dev/null; then
        echo "  Status: Running (PID: $PID)"
        # Get process info
        ps -p "$PID" -o %cpu,%mem,etime | tail -1 | while read cpu mem time; do
            echo "  CPU: $cpu%  Memory: $mem%  Uptime: $time"
        done
    else
        echo "  Status: Not running (stale PID file)"
    fi
else
    echo "  Status: Not running"
fi

# Check tmux session
echo ""
echo "Claude Session (tmux):"
if tmux has-session -t jarvis 2>/dev/null; then
    echo "  Status: Active"
    echo "  Attach: tmux attach -t jarvis"
    # Show last few lines of session
    echo ""
    echo "  Last output:"
    tmux capture-pane -t jarvis -p -S -5 | sed 's/^/    /'
else
    echo "  Status: No active session"
fi

# Check audio devices (queries sounddevice; degrades to a message if unavailable)
echo ""
echo "Audio:"
if command -v python3 &>/dev/null; then
    python3 -c "
import sounddevice as sd
try:
    default_input = sd.query_devices(kind='input')
    print(f\"  Input device: {default_input['name']}\")
except Exception as e:
    print(f\"  Input device: Error - {e}\")
" 2>/dev/null || echo "  Input device: Unable to query"
fi

# Show recent log entries
if [ -f "$LOG_FILE" ]; then
    echo ""
    echo "Recent log entries:"
    tail -5 "$LOG_FILE" | sed 's/^/  /'
fi

echo ""
echo "Commands:"
echo "  jarvis-start                 Start daemon"
echo "  jarvis-start --no-wake-word  Start without wake word (immediate listen)"
echo "  jarvis-stop                  Stop daemon (keep tmux session)"
echo "  jarvis-stop --kill-session   Stop daemon and kill session"
echo "  tmux attach -t jarvis        View Claude session"
#!/bin/bash
# Stop the Jarvis daemon: graceful kill first, force kill after a grace
# period; also sweeps orphan processes when the PID file is missing.

PID_FILE="$HOME/.local/state/jarvis/jarvis.pid"

# Check for --kill-session flag
KILL_SESSION=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --kill-session)
            KILL_SESSION=true
            shift
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: jarvis-stop [--kill-session]"
            echo "  --kill-session  Also kill the tmux Claude session"
            exit 1
            ;;
    esac
done

if [ ! -f "$PID_FILE" ]; then
    echo "No PID file found, checking for orphan processes..."
    # Find any running jarvis-daemon processes
    ORPHAN_PIDS=$(pgrep -f "jarvis-daemon" 2>/dev/null || true)
    if [ -n "$ORPHAN_PIDS" ]; then
        echo "Found orphan jarvis-daemon process(es): $ORPHAN_PIDS"
        for OPID in $ORPHAN_PIDS; do
            echo "Killing orphan PID: $OPID"
            kill "$OPID" 2>/dev/null || true
        done
        sleep 1
        # Force kill if still running
        for OPID in $ORPHAN_PIDS; do
            if kill -0 "$OPID" 2>/dev/null; then
                echo "Force killing PID: $OPID"
                kill -9 "$OPID" 2>/dev/null || true
            fi
        done
        echo "Orphan processes cleaned up"
    else
        echo "Jarvis daemon is not running"
    fi
    # Still check for tmux session
    if tmux has-session -t jarvis 2>/dev/null; then
        if [ "$KILL_SESSION" = true ]; then
            tmux kill-session -t jarvis
            echo "Killed tmux jarvis session"
        else
            echo "Note: tmux jarvis session still exists (use --kill-session to remove)"
        fi
    fi
    exit 0
fi

PID=$(cat "$PID_FILE")
if kill -0 "$PID" 2>/dev/null; then
    echo "Stopping Jarvis daemon (PID: $PID)..."
    kill "$PID"
    # Wait for graceful shutdown (up to ~5s)
    for i in {1..10}; do
        if ! kill -0 "$PID" 2>/dev/null; then
            break
        fi
        sleep 0.5
    done
    # Force kill if still running
    if kill -0 "$PID" 2>/dev/null; then
        echo "Force killing daemon..."
        kill -9 "$PID"
    fi
    echo "Jarvis daemon stopped"
else
    echo "Jarvis daemon was not running (stale PID file)"
fi

# Clean up PID file
rm -f "$PID_FILE"

# Handle tmux session
if tmux has-session -t jarvis 2>/dev/null; then
    if [ "$KILL_SESSION" = true ]; then
        tmux kill-session -t jarvis
        echo "Killed tmux jarvis session"
    else
        echo "Note: tmux jarvis session still exists"
        echo "  - Use 'tmux attach -t jarvis' to resume manually"
        echo "  - Use 'jarvis-stop --kill-session' to also kill it"
    fi
fi
# Jarvis Voice Assistant Daemon Dependencies
# Audio capture
sounddevice>=0.4.6
numpy>=1.24.0
# Resampling (the daemon imports scipy.signal directly)
scipy>=1.10.0
# Wake word detection
openwakeword>=0.6.0
# Voice Activity Detection (Silero VAD is loaded via torch.hub)
silero-vad>=4.0
torch>=2.0.0
# HTTP client for Whisper server
requests>=2.31.0
# Local Whisper fallback
faster-whisper>=1.0.0
# Config parsing
PyYAML>=6.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment