Last active
March 4, 2026 08:56
-
-
Save jerlendds/d01e06ea6f5b1282dad9d65f7c63a6d1 to your computer and use it in GitHub Desktop.
~/.config/voxtype/config.toml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Voxtype Configuration
#
# Location: ~/.config/voxtype/config.toml
# All settings can be overridden via CLI flags

# State file for external integrations (Waybar, polybar, etc.)
# Use "auto" for default location ($XDG_RUNTIME_DIR/voxtype/state),
# a custom path, or "disabled" to turn off. The daemon writes state
# ("idle", "recording", "transcribing") to this file whenever it changes.
# Required for `voxtype record toggle` and `voxtype status` commands.
state_file = "auto"

[hotkey]
# Key to hold for push-to-talk
# Common choices: SCROLLLOCK, PAUSE, RIGHTALT, F13-F24
# Use `evtest` to find key names for your keyboard
key = "KEY_F9"

# Optional modifier keys that must also be held
# Example: modifiers = ["LEFTCTRL", "LEFTALT"]
modifiers = []

# Activation mode: "push_to_talk" or "toggle"
# - push_to_talk: Hold hotkey to record, release to transcribe (default)
# - toggle: Press hotkey once to start recording, press again to stop
# mode = "push_to_talk"

# Enable built-in hotkey detection (default: true)
# Set to false when using compositor keybindings (Hyprland, Sway) instead
# When disabled, use `voxtype record start/stop/toggle` to control recording
# enabled = true

# Modifier key to select secondary model (evdev input mode only)
# When held while pressing the hotkey, uses whisper.secondary_model instead
# Example: model_modifier = "LEFTSHIFT"  # Shift+hotkey uses secondary model
# model_modifier = "LEFTSHIFT"

[audio]
# Audio input device ("default" uses system default)
# List devices with: pactl list sources short
device = "default"

# Sample rate in Hz (whisper expects 16000)
sample_rate = 16000

# Maximum recording duration in seconds (safety limit)
max_duration_secs = 360

# [audio.feedback]
# Enable audio feedback sounds (beeps when recording starts/stops)
# enabled = true
#
# Sound theme: "default", "subtle", "mechanical", or path to custom theme directory
# theme = "default"
#
# Volume level (0.0 to 1.0)
# volume = 0.7

[whisper]
# Transcription backend: "local" or "remote"
# - local: Use whisper.cpp locally (default)
# - remote: Send audio to a remote whisper.cpp server or OpenAI-compatible API
# backend = "local"

# Model to use for transcription (local backend)
# Options: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v3, large-v3-turbo
# .en models are English-only but faster and more accurate for English
# large-v3-turbo is faster than large-v3 with minimal accuracy loss (recommended for GPU)
# Or provide absolute path to a custom .bin model file
model = "large-v3-turbo"

# Language for transcription
# Options:
# - Single language: "en", "fr", "de", etc.
# - Auto-detect all: "auto"
# - Constrained auto-detect: ["en", "fr"] (detects from allowed set only)
# The array form helps with multilingual users where Whisper might misdetect
# the language, especially for short sentences.
# See: https://github.com/openai/whisper#available-models-and-languages
language = "en"

# Translate non-English speech to English
translate = true

# Number of CPU threads for inference (omit for auto-detection)
# threads = 4

# Initial prompt to provide context for transcription
# Use this to hint at terminology, proper nouns, or formatting conventions.
# Example: "Technical discussion about Rust, TypeScript, and Kubernetes."
# initial_prompt = ""

# --- Multi-model settings ---
#
# Secondary model for difficult audio (used with hotkey.model_modifier or CLI --model)
# secondary_model = "large-v3-turbo"
#
# List of available models that can be requested via CLI --model flag
# available_models = ["large-v3-turbo", "medium.en"]
#
# Maximum models to keep loaded in memory (LRU eviction when exceeded)
# Default: 2 (primary + one secondary). Only applies when gpu_isolation = false.
# max_loaded_models = 2
#
# Seconds before unloading idle secondary models (0 = never auto-unload)
# Default: 300 (5 minutes). Only applies when gpu_isolation = false.
# cold_model_timeout_secs = 300

# --- Eager processing settings ---
#
# Enable eager input processing (transcribe chunks while recording continues)
# Reduces perceived latency on slower machines by processing audio in parallel.
# eager_processing = false
#
# Duration of each audio chunk in seconds (default: 5.0)
# eager_chunk_secs = 5.0
#
# Overlap between chunks in seconds (helps catch words at boundaries, default: 0.5)
# eager_overlap_secs = 0.5

# --- Remote backend settings (used when backend = "remote") ---
#
# Remote server endpoint URL (required for remote backend)
# Examples:
# - whisper.cpp server: "http://192.168.1.100:8080"
# - OpenAI API: "https://api.openai.com"
# remote_endpoint = "http://192.168.1.100:8080"
#
# Model name to send to remote server (default: "whisper-1")
# remote_model = "whisper-1"
#
# API key for remote server (optional, or use VOXTYPE_WHISPER_API_KEY env var)
# remote_api_key = ""
#
# Timeout for remote requests in seconds (default: 30)
# remote_timeout_secs = 30

[output]
# Primary output mode: "type" or "clipboard"
# - type: Simulates keyboard input at cursor position (requires ydotool)
# - clipboard: Copies text to clipboard (requires wl-copy)
mode = "type"

# Fall back to clipboard if typing fails
fallback_to_clipboard = true

# Custom driver order for type mode (optional)
# Default order: wtype -> dotool -> ydotool -> clipboard
# Customize to prefer a specific driver or change the fallback order.
# Available drivers: wtype, dotool, ydotool, clipboard
# Example: prefer ydotool over dotool:
# driver_order = ["wtype", "ydotool", "dotool", "clipboard"]
# Example: use only ydotool, no fallback:
driver_order = ["ydotool", "dotool", "clipboard"]
# driver_order = ["wtype", "dotool", "ydotool", "clipboard"]

# Delay between typed characters in milliseconds
# 0 = fastest possible, increase if characters are dropped
type_delay_ms = 0

# Automatically submit (send Enter key) after outputting transcribed text
# Useful for chat applications, command lines, or forms where you want
# to auto-submit after dictation
# auto_submit = true

# Convert newlines to Shift+Enter instead of regular Enter
# Useful for applications where Enter submits (e.g., Cursor IDE, Slack, Discord)
# shift_enter_newlines = false

# Restore clipboard content after paste mode (default: false)
# Saves clipboard before transcription, restores it after paste keystroke
# Only applies to mode = "paste". Useful when you want to preserve your
# existing clipboard content across dictation operations.
# restore_clipboard = false

# Delay after paste before restoring clipboard (milliseconds)
# Allows time for the paste operation to complete (default: 200)
# restore_clipboard_delay_ms = 200

# Pre/post output hooks (optional)
# Commands to run before and after typing output. Useful for compositor integration.
# Example: Block modifier keys during typing with Hyprland submap:
# pre_output_command = "hyprctl dispatch submap voxtype_suppress"
# post_output_command = "hyprctl dispatch submap reset"
# See troubleshooting docs for the required Hyprland submap configuration.

# Post-processing command (optional)
# Pipe transcribed text through an external command for cleanup before output.
# The command receives text on stdin and outputs processed text on stdout.
# Useful for LLM-based text cleanup, grammar correction, filler word removal.
# On any failure (timeout, error), falls back to original transcription.
#
# [output.post_process]
# command = "ollama run llama3.2:1b 'Clean up this dictation. Fix grammar, remove filler words. Output only the cleaned text:'"
# timeout_ms = 30000  # 30 second timeout (generous for LLM)

[output.notification]
# Show notification when recording starts (hotkey pressed)
on_recording_start = false

# Show notification when recording stops (transcription beginning)
on_recording_stop = false

# Show notification with transcribed text after transcription completes
on_transcription = true

# [text]
# Text processing options (word replacements, spoken punctuation)
#
# Enable spoken punctuation conversion (e.g., say "period" to get ".")
# spoken_punctuation = false
#
# Custom word replacements (case-insensitive)
# replacements = { "vox type" = "voxtype" }

# [vad]
# Voice Activity Detection - filters silence-only recordings
# Prevents Whisper hallucinations on silent audio
#
# enabled = false               # Enable VAD (off by default)
# threshold = 0.5               # 0.0 = sensitive, 1.0 = aggressive
# min_speech_duration_ms = 100  # Minimum speech required

# [status]
# Status display icons for Waybar/tray integrations
#
# Icon theme (or path to custom theme file):
# Font-based (require specific fonts):
# - "emoji" - Default emoji icons (ποΈ π€ β³)
# - "nerd-font" - Nerd Font icons (requires Nerd Font)
# - "material" - Material Design Icons (requires MDI font)
# - "phosphor" - Phosphor Icons (requires Phosphor font)
# - "codicons" - VS Code icons (requires Codicons font)
# - "omarchy" - Omarchy distro icons
# Universal (no special fonts needed):
# - "minimal" - Simple Unicode (β β β Γ)
# - "dots" - Geometric shapes (⯠⬀ β β)
# - "arrows" - Media player style (βΆ β β» β )
# - "text" - Plain text ([MIC] [REC] [...] [OFF])
# icon_theme = "emoji"
#
# Per-state icon overrides (optional, takes precedence over theme)
# [status.icons]
# idle = "ποΈ"
# recording = "π€"
# transcribing = "β³"
# stopped = ""

# [profiles]
# Named profiles for context-specific post-processing
# Use with: voxtype record start --profile slack
#
# [profiles.slack]
# post_process_command = "ollama run llama3.2:1b 'Format for Slack...'"
#
# [profiles.code]
# post_process_command = "ollama run llama3.2:1b 'Format as code comment...'"
#
# [profiles.code]
# output_mode = "clipboard"

# NOTE(review): leftover alternative settings for driving voxtype purely via
# compositor/WM keybindings. Do NOT uncomment the `[hotkey]` header here —
# TOML forbids defining the same table twice; merge `enabled = false` into
# the [hotkey] table above instead. `state_file` already appears at the top.
# [hotkey]
# enabled = false
# state_file = "auto"
Author
i3 bindings
# Push-to-talk via i3: start recording when $mod+Shift+v is pressed,
# stop it when the key is released (per the config's push-to-talk comments,
# release triggers transcription).
# NOTE(review): presumably pairs with `hotkey.enabled = false` in voxtype's
# config so the daemon's built-in hotkey detection doesn't also fire — confirm.
bindsym $mod+Shift+v exec ~/.local/bin/voxtype-record.sh start
bindsym --release $mod+Shift+v exec ~/.local/bin/voxtype-record.sh stop
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.
~/.local/bin/voxtype-record.sh