Add voice dictation with voxtype (#4088)

* Try with voxtype
* Update delay to prevent skipped characters
* Add removal of voxtype
* Use -bin package
* Fix for yay
* Nerdicons!
* Use new, subtle nerdfont glyphs instead of standard icons
* Do this in the waybar config instead
* Make voxtype a permanent fixture so it is not lost on waybar resets
* Record purpose
* Add Dictation hotkeys
* Tweak wording and point to config
* Reuse the same help
* Better communication
* Anticipate moving the package to OPR
* Clarify where the packages are coming from
* input group not needed when using hyprland hotkey
* Explain hotkey off
* Allow for changing of the model on right click
* Allow config on right click
* Be more specific
* Focus on config editing with waybar clicks

---------

Co-authored-by: Ryan Hughes <ryan@heyoodle.com>
2026-02-17 15:25:37 +00:00 · 2026-01-06 13:00:19 +01:00
parent e3cd567f6f
commit 0d42f1bafe
12 changed files with 263 additions and 6 deletions
--- a/default/hypr/bindings/utilities.conf
+++ b/default/hypr/bindings/utilities.conf
@@ -49,4 +49,8 @@ bindd = SUPER CTRL ALT, B, Show battery remaining, exec, notify-send "󰁹    Ba
 bindd = SUPER CTRL, A, Audio controls, exec, omarchy-launch-audio
 bindd = SUPER CTRL, B, Bluetooth controls, exec, omarchy-launch-bluetooth
 bindd = SUPER CTRL, W, Wifi controls, exec, omarchy-launch-wifi
-bindd = SUPER CTRL, T, Activity, exec, omarchy-launch-tui btop
+bindd = SUPER CTRL, T, Activity, exec, omarchy-launch-tui btop
+
+# Dictation (hold the keys to record; releasing them stops the recording)
+bindd  = SUPER CTRL, X, Start dictation, exec, voxtype record start
+binddr = SUPER CTRL, X, Stop dictation, exec, voxtype record stop
--- a/default/voxtype/config.toml
+++ b/default/voxtype/config.toml
@@ -0,0 +1,97 @@
+# Voxtype Configuration
+#
+# Location: ~/.config/voxtype/config.toml
+# All settings can be overridden via CLI flags
+#
+# State file for external integrations (Waybar, polybar, etc.)
+# Use "auto" for default location ($XDG_RUNTIME_DIR/voxtype/state),
+# a custom path, or "disabled" to turn off. The daemon writes state
+# ("idle", "recording", "transcribing") to this file whenever it changes.
+# Required for `voxtype record toggle` and `voxtype status` commands.
+state_file = "auto"
+
+[hotkey]
+# Built-in hotkey is disabled because the hotkey is bound in Hyprland instead. Default is Super + Ctrl + X
+enabled = false
+
+[audio]
+# Audio input device ("default" uses system default)
+# List devices with: pactl list sources short
+device = "default"
+
+# Sample rate in Hz (whisper expects 16000)
+sample_rate = 16000
+
+# Maximum recording duration in seconds (safety limit)
+max_duration_secs = 60
+
+# [audio.feedback]
+# Enable audio feedback sounds (beeps when recording starts/stops)
+# enabled = true
+#
+# Sound theme: "default", "subtle", "mechanical", or path to custom theme directory
+# theme = "default"
+#
+# Volume level (0.0 to 1.0)
+# volume = 0.7
+
+[whisper]
+# Model to use for transcription
+# Options: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v3, large-v3-turbo
+# .en models are English-only but faster and more accurate for English
+# large-v3-turbo is faster than large-v3 with minimal accuracy loss (recommended for GPU)
+# Or provide absolute path to a custom .bin model file
+model = "base.en"
+
+# Language for transcription
+# Use "en" for English, "auto" for auto-detection
+# See: https://github.com/openai/whisper#available-models-and-languages
+language = "en"
+
+# Translate non-English speech to English
+translate = false
+
+# Number of CPU threads for inference (omit for auto-detection)
+# threads = 4
+
+[output]
+# Primary output mode: "type" or "clipboard"
+# - type: Simulates keyboard input at cursor position (requires ydotool)
+# - clipboard: Copies text to clipboard (requires wl-copy)
+mode = "type"
+
+# Fall back to clipboard if typing fails
+fallback_to_clipboard = true
+
+# Delay between typed characters in milliseconds
+# 0 = fastest possible; increase if characters are dropped
+type_delay_ms = 1
+
+# Post-processing command (optional)
+# Pipe transcribed text through an external command for cleanup before output.
+# The command receives text on stdin and outputs processed text on stdout.
+# Useful for LLM-based text cleanup, grammar correction, filler word removal.
+# On any failure (timeout, error), falls back to original transcription.
+#
+# [output.post_process]
+# command = "ollama run llama3.2:1b 'Clean up this dictation. Fix grammar, remove filler words. Output only the cleaned text:'"
+# timeout_ms = 30000  # 30 second timeout (generous for LLM)
+
+[output.notification]
+# Show notification when recording starts (hotkey pressed)
+on_recording_start = false
+
+# Show notification when recording stops (transcription beginning)
+on_recording_stop = false
+
+# Show notification with transcribed text after transcription completes
+on_transcription = false
+
+# [text]
+# Text processing options (word replacements, spoken punctuation)
+#
+# Enable spoken punctuation conversion (e.g., say "period" to get ".")
+# spoken_punctuation = false
+#
+# Custom word replacements (case-insensitive)
+# replacements = { "hyperwhisper" = "hyprwhspr" }