# Local Push-to-Talk Dictation on Linux (System76 / Pop!_OS)
Uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) + your NVIDIA GPU for local, private speech-to-text.
Hold **Right Alt** to record, release to transcribe and type into any focused window.
Tested on: System76 laptop, Pop!_OS 22.04, NVIDIA RTX 4050, GNOME on X11.
---
## Prerequisites
```bash
# These should already be present on Pop!_OS:
which arecord   # from alsa-utils
which xdotool   # sudo apt install xdotool (if missing)
which uv        # https://docs.astral.sh/uv/getting-started/installation/
```
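Before going further, it's worth confirming that the GPU driver and microphone both work. A quick sanity check (the `/tmp/mic-test.wav` path here is just an example):
```bash
# GPU visible to the driver?
nvidia-smi

# Record 3 seconds from the default mic, then play it back
arecord -f S16_LE -r 16000 -c 1 -d 3 /tmp/mic-test.wav && aplay /tmp/mic-test.wav
```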
---
## 1. Create a Python 3.10 venv
> **Why 3.10?** The system default on Pop!_OS may be Python 3.14, which has
> ctranslate2 wheel compatibility issues. Python 3.10 is stable for this stack.
```bash
mkdir -p ~/.local/share/whisper-dictation ~/.local/bin
uv venv ~/.local/share/whisper-dictation/venv --python 3.10
```
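To confirm uv picked the right interpreter:
```bash
~/.local/share/whisper-dictation/venv/bin/python --version   # should print Python 3.10.x
```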
---
## 2. Install packages
```bash
# Core packages (pynput pulls in python-xlib, which the script below uses directly)
uv pip install --python ~/.local/share/whisper-dictation/venv/bin/python \
    faster-whisper pynput

# CUDA 12 runtime libraries (needed even if your driver reports CUDA 13.x)
# The driver ships separately from the runtime — these fill the gap.
uv pip install --python ~/.local/share/whisper-dictation/venv/bin/python \
    nvidia-cublas-cu12 nvidia-cudnn-cu12
```
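A quick smoke test that both packages import cleanly (GPU loading is exercised later, when the script starts):
```bash
~/.local/share/whisper-dictation/venv/bin/python -c \
  "import faster_whisper, Xlib; print('faster-whisper', faster_whisper.__version__)"
```
If this import fails, the systemd service in step 4 will fail the same way, so fix it here first.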
---
## 3. Create the dictation script
Save the script below to `~/.local/bin/whisper-dictation` (replacing `YOUR_USERNAME` in the shebang with your actual username) and make it executable:
```bash
chmod +x ~/.local/bin/whisper-dictation
```
> **Why not pynput?**
> pynput uses X11's `XRecord` extension, which on Pop!_OS/GNOME only captures
> synthetic key events (e.g. from xdotool), not physical keypresses from your
> keyboard. The script below uses `XGrabKey` via python-xlib (the same
> mechanism GNOME uses for its own shortcuts), which always captures physical
> keys.
```python
#!/home/YOUR_USERNAME/.local/share/whisper-dictation/venv/bin/python
"""
whisper-dictation: Push-to-talk dictation using faster-whisper + GPU.

Hold Right Alt to record, release to transcribe and type into focused window.
"""
import os
import subprocess
import tempfile
import threading

from faster_whisper import WhisperModel
from Xlib import X, display as xdisplay

# --- Config ---
MODEL_SIZE = "medium"    # tiny / base / small / medium / large-v3
DEVICE = "cuda"
COMPUTE_TYPE = "float16"
# Right Alt / AltGr — grabs both keysyms to cover different keyboard layouts
GRAB_KEYSYMS = [0xfe03, 0xffea]  # ISO_Level3_Shift, Alt_R
# --------------


def notify(summary, body="", urgency="normal"):
    try:
        subprocess.Popen(
            ["notify-send", "-a", "whisper-dictation", "-u", urgency, summary, body],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        pass  # notify-send missing: run without desktop notifications


print(f"Loading whisper model ({MODEL_SIZE} on {DEVICE})...", flush=True)
notify("Whisper Dictation", f"Loading {MODEL_SIZE} model...")
model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
print("Ready. Hold Right Alt to dictate.", flush=True)
notify("Whisper Dictation", "Ready — hold Right Alt to dictate.")

_lock = threading.Lock()
_recording_proc = None
_recording_file = None
_is_recording = False


def _start_recording():
    global _recording_proc, _recording_file, _is_recording
    with _lock:
        if _is_recording:
            return
        _is_recording = True
        # mkstemp instead of the deprecated mktemp (avoids a temp-path race)
        fd, _recording_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        _recording_proc = subprocess.Popen(
            ["arecord", "-f", "S16_LE", "-r", "16000", "-c", "1", _recording_file],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
    print("[recording]", flush=True)
    notify("Recording...", urgency="low")


def _stop_and_transcribe():
    global _recording_proc, _recording_file, _is_recording
    with _lock:
        if not _is_recording:
            return
        _is_recording = False
        proc, wav = _recording_proc, _recording_file
        _recording_proc = None
        _recording_file = None
    if proc:
        proc.terminate()
        proc.wait()
    if wav and os.path.exists(wav):
        try:
            segments, _ = model.transcribe(wav, beam_size=5)
            text = " ".join(seg.text for seg in segments).strip()
            print(f"[result] {text!r}", flush=True)
            if text:
                subprocess.run(
                    ["xdotool", "type", "--clearmodifiers", "--", text],
                    check=False,
                )
        finally:
            try:
                os.unlink(wav)
            except OSError:
                pass


# --- X11 key grab loop ---
d = xdisplay.Display()
root = d.screen().root
grabbed_keycodes = set()
for keysym in GRAB_KEYSYMS:
    kc = d.keysym_to_keycode(keysym)
    if kc and kc not in grabbed_keycodes:
        root.grab_key(kc, X.AnyModifier, True, X.GrabModeAsync, X.GrabModeAsync)
        grabbed_keycodes.add(kc)
d.sync()

while True:
    event = d.next_event()
    if event.type == X.KeyPress and event.detail in grabbed_keycodes:
        threading.Thread(target=_start_recording, daemon=True).start()
    elif event.type == X.KeyRelease and event.detail in grabbed_keycodes:
        # Filter out X11 auto-repeat (sends Release+Press pairs for held keys)
        if d.pending_events():
            peek = d.next_event()
            if peek.type == X.KeyPress and peek.detail == event.detail:
                continue  # auto-repeat, ignore
            threading.Thread(target=_stop_and_transcribe, daemon=True).start()
            if peek.type == X.KeyPress and peek.detail in grabbed_keycodes:
                threading.Thread(target=_start_recording, daemon=True).start()
        else:
            threading.Thread(target=_stop_and_transcribe, daemon=True).start()
```
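Before wiring up systemd, you can test the script in a foreground terminal (Ctrl+C to quit). The CUDA libraries from step 2 need to be on `LD_LIBRARY_PATH`, same as in the service file below; `VENV` here is just a local shell variable for brevity:
```bash
VENV=~/.local/share/whisper-dictation/venv
LD_LIBRARY_PATH="$VENV/lib/python3.10/site-packages/nvidia/cublas/lib:$VENV/lib/python3.10/site-packages/nvidia/cudnn/lib" \
  ~/.local/bin/whisper-dictation
```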
---
## 4. Create the systemd user service
Save to `~/.config/systemd/user/whisper-dictation.service`:
```ini
[Unit]
Description=Whisper Dictation (push-to-talk speech recognition)
After=graphical-session.target
PartOf=graphical-session.target

[Service]
Type=simple
ExecStart=/home/YOUR_USERNAME/.local/bin/whisper-dictation
Restart=on-failure
RestartSec=5
# Point ctranslate2 at the pip-installed CUDA 12 runtime .so files
Environment=LD_LIBRARY_PATH=/home/YOUR_USERNAME/.local/share/whisper-dictation/venv/lib/python3.10/site-packages/nvidia/cublas/lib:/home/YOUR_USERNAME/.local/share/whisper-dictation/venv/lib/python3.10/site-packages/nvidia/cudnn/lib

[Install]
WantedBy=graphical-session.target
```
> Replace `YOUR_USERNAME` with your actual username (`echo $USER`).
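If you'd rather not edit by hand, a one-shot substitution works too (assuming you saved both files at the paths above):
```bash
sed -i "s/YOUR_USERNAME/$USER/g" \
  ~/.local/bin/whisper-dictation \
  ~/.config/systemd/user/whisper-dictation.service
```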
---
## 5. Enable and start
```bash
systemctl --user daemon-reload
systemctl --user enable --now whisper-dictation.service
```
The first start downloads the model (~1.5 GB for `medium`). Watch progress with:
```bash
journalctl --user -u whisper-dictation -f
```
You'll see `Ready. Hold Right Alt to dictate.` and get a desktop notification when it's ready.
---
## Usage
- **Hold Right Alt** → recording starts
- **Release Right Alt** → transcribes and types text into the focused window
- Works in any app: browser, terminal, text editor, Slack, etc.
---
## Troubleshooting
| Symptom | Fix |
|---------|-----|
| `libcublas.so.12 not found` | Run the `nvidia-cublas-cu12` install step and add `LD_LIBRARY_PATH` to the service |
| Key press not detected | Make sure you're on X11 (`echo $XDG_SESSION_TYPE`), not Wayland |
| Empty transcription | Check microphone: `arecord -f S16_LE -r 16000 -c 1 test.wav` then play it back |
| High VRAM usage | Change `MODEL_SIZE = "small"` or `"base"` in the script |
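If GPU errors persist after the `libcublas` fix, check whether ctranslate2 (the engine underneath faster-whisper) can see the GPU at all. `ctranslate2.get_cuda_device_count()` should report at least 1; as with the service, the CUDA runtime libraries must be on `LD_LIBRARY_PATH` (`VENV` below is just a convenience variable):
```bash
VENV=~/.local/share/whisper-dictation/venv
LD_LIBRARY_PATH="$VENV/lib/python3.10/site-packages/nvidia/cublas/lib:$VENV/lib/python3.10/site-packages/nvidia/cudnn/lib" \
  "$VENV/bin/python" -c "import ctranslate2; print('CUDA devices:', ctranslate2.get_cuda_device_count())"
```
If this prints `CUDA devices: 0`, the problem is the driver or runtime libraries, not the dictation script.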
---
## Model size guide
| Model | VRAM | Notes |
|-------|------|-------|
| `tiny` | ~200 MB | Fastest, lower accuracy |
| `base` | ~300 MB | Good for clear speech |
| `small` | ~600 MB | Good balance |
| `medium` | ~1.5 GB | Recommended |
| `large-v3` | ~3–4 GB | Best accuracy |
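To switch models, edit `MODEL_SIZE` in `~/.local/bin/whisper-dictation` and restart the service; a model you haven't used before is downloaded on the next start:
```bash
systemctl --user restart whisper-dictation
```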