Skip to content

Instantly share code, notes, and snippets.

@LaptopDev
Created November 29, 2025 13:11
Show Gist options
  • Select an option

  • Save LaptopDev/ad051f7005e87c46e9674db172544420 to your computer and use it in GitHub Desktop.

Select an option

Save LaptopDev/ad051f7005e87c46e9674db172544420 to your computer and use it in GitHub Desktop.
Linux transcription (X11 & Wayland) with whisper.cpp and its built-in VAD. Records the microphone and/or desktop audio, and can also transcribe existing .wav files passed as arguments.
#!/usr/bin/env bash
#
# Record desktop and/or microphone audio with ffmpeg and transcribe it with
# whisper.cpp (whisper-cli).
#
# Usage:
#   $0 [dt|me|both]                    record desktop / mic / both, then transcribe
#   $0 --retranscribe file.wav [...]   transcribe existing recordings
#
# Environment overrides: WHISPER_BIN, WHISPER_MODEL, VAD_MODEL.
set -euo pipefail

# CONFIG
# Recordings and transcripts are written here.
DIR="${HOME}/.transcription"
mkdir -p "$DIR"

# Each path keeps its previous hard-coded value as the default; export a
# variable of the same name to point at a different whisper.cpp install.
WHISPER_BIN="${WHISPER_BIN:-/home/user/source/git/whisper_gpu.cpp/bin/whisper-cli}"
WHISPER_MODEL="${WHISPER_MODEL:-/home/user/source/git/whisper_gpu.cpp/models/ggml-large-v3.bin}"
VAD_MODEL="${VAD_MODEL:-/home/user/source/git/whisper_gpu.cpp/models/ggml-silero-v5.1.2.bin}"

# dependencies
# Only tools without a fallback are fatal. wpctl, notify-send and xsel are
# deliberately NOT required: the script falls back to pactl for the sink,
# to echo for notifications, and to xclip (or no clipboard) for copying.
for cmd in ffmpeg ffprobe pactl; do
  command -v "$cmd" >/dev/null 2>&1 || { echo "Missing dependency: $cmd" >&2; exit 127; }
done
[[ -x "$WHISPER_BIN" ]] || { echo "Missing whisper binary: $WHISPER_BIN" >&2; exit 127; }
[[ -f "$WHISPER_MODEL" ]] || { echo "Missing model file: $WHISPER_MODEL" >&2; exit 127; }
# The VAD model is only used for recordings of 20 s or more, so its absence
# is reported but is not fatal.
[[ -f "$VAD_MODEL" ]] || echo "Warning: VAD model not found: $VAD_MODEL" >&2
# Show a desktop notification titled with the global $MOTIVE.
# All arguments are joined into a single message body. When notify-send
# fails (or is unavailable) the message is printed to stdout instead.
notify() {
  local body="$*"
  if ! notify-send "$MOTIVE" "$body" 2>/dev/null; then
    echo "[$MOTIVE] $body"
  fi
}
#######################################
# Transcribe one audio file with whisper-cli and publish the result.
#
# Writes <file minus .wav>.srt (whisper output with timestamps) and
# <file minus .wav>.txt (timestamps stripped), copies the text to the
# clipboard when xsel or xclip is available, and sends notifications.
#
# Globals:   WHISPER_BIN, WHISPER_MODEL, VAD_MODEL (read)
# Arguments: $1 - path of the audio file to transcribe
# Returns:   0 on success or when the file does not exist (skipped),
#            1 when whisper fails or the output files cannot be written
#######################################
transcribe() {
  local file="$1"
  if [[ ! -f "$file" ]]; then
    notify "No file to transcribe: $file"
    return 0
  fi
  local name="${file##*/}"
  notify "Transcribing $name"

  local start_time end_time duration
  start_time=$(date +%s)

  # Duration in whole seconds. Anything ffprobe cannot report as a number
  # (failure, empty output, "N/A") is treated as 0, i.e. a short clip.
  local raw_len audio_len
  raw_len=$(ffprobe -v error -show_entries format=duration \
    -of default=nw=1:nk=1 "$file") || raw_len=""
  audio_len="${raw_len%%.*}"
  [[ "$audio_len" =~ ^[0-9]+$ ]] || audio_len=0

  # Build the command line once; VAD options are added only for recordings
  # of 20 s or more, and only when the VAD model is actually present.
  local -a whisper_args=(-m "$WHISPER_MODEL" -f "$file")
  if (( 10#$audio_len >= 20 )); then
    if [[ -f "${VAD_MODEL:-}" ]]; then
      whisper_args+=(
        --vad
        --vad-model "$VAD_MODEL"
        --vad-threshold 0.6
        --vad-min-silence-duration-ms 200
      )
    else
      notify "VAD model not found; transcribing $name without VAD"
    fi
  fi

  # Check whisper explicitly: `set -e` is ignored when this function is
  # called from an `&&`/`||` list, and a failed run must not produce empty
  # transcripts or overwrite the clipboard.
  local out_srt
  if ! out_srt=$("$WHISPER_BIN" "${whisper_args[@]}"); then
    notify "Transcription failed: $name"
    return 1
  fi
  end_time=$(date +%s)
  duration=$((end_time - start_time))

  # remove whisper header (first line of stdout)
  out_srt=$(printf '%s\n' "$out_srt" | sed '1d')
  # strip "[hh:mm:ss.mmm --> hh:mm:ss.mmm]" prefixes → TXT
  local out_txt
  out_txt=$(printf '%s\n' "$out_srt" \
    | sed -E 's/^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}[[:space:]]+-->[[:space:]]+[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}\][[:space:]]*//')

  local srt_file="${file%.wav}.srt"
  local txt_file="${file%.wav}.txt"
  if ! printf '%s\n' "$out_srt" > "$srt_file" \
    || ! printf '%s\n' "$out_txt" > "$txt_file"; then
    notify "Could not write transcription for $name"
    return 1
  fi

  # clipboard (best effort: a clipboard failure must not lose the result)
  if command -v xsel >/dev/null 2>&1; then
    printf '%s' "$out_txt" | xsel --clipboard --input \
      || notify "Clipboard copy failed"
  elif command -v xclip >/dev/null 2>&1; then
    printf '%s' "$out_txt" | xclip -selection clipboard \
      || notify "Clipboard copy failed"
  fi

  notify "Saved transcription: $(basename "$txt_file") (${duration}s)"
}
# --- MODE ---
# Re-transcription mode: skip recording entirely and run transcribe on each
# file named after the --retranscribe flag, then exit.
case "${1:-}" in
  --retranscribe)
    MOTIVE="Transcriber"
    shift
    if (( $# == 0 )); then
      echo "Usage: $0 --retranscribe file1.wav [file2.wav ...]"
      exit 2
    fi
    for wav in "$@"; do
      transcribe "$wav"
    done
    exit 0
    ;;
esac
# ORIGINAL RECORDING MODES
# Record the desktop monitor (dt), the default microphone (me) or both with
# ffmpeg, wait until the recording stops, then transcribe what was captured.
mode="${1:-dt}"
case "$mode" in
  dt|me|both) ;;
  *) echo "Usage: $0 [dt|me|both] OR $0 --retranscribe file.wav [...]"; exit 2;;
esac
MOTIVE="Recorder"

# A failing pactl must not abort the script under `set -e`: the per-mode
# checks below report a missing device, and not every mode needs both.
MIC_SRC="$(pactl get-default-source || true)"

# Sink lookup: prefer wpctl's configured default, fall back to pactl when
# wpctl is missing, fails, or lists no Audio/Sink entry.
SINK=""
if command -v wpctl >/dev/null 2>&1; then
  # awk reads all of its input (no early `exit`) so wpctl cannot be killed
  # by SIGPIPE, which `pipefail` would turn into a failed assignment.
  SINK="$(wpctl status \
    | awk '/Default Configured Devices:/{f=1;next} f&&/Audio\/Sink/&&!seen{print $NF; seen=1}' \
    || true)"
fi
if [[ -z "$SINK" ]]; then
  SINK="$(pactl get-default-sink || true)"
fi
DESK_MON="${SINK:+$SINK.monitor}"

ts=$(date '+%F_%H-%M-%S')
wav_dt="${DIR}/${ts}-dt.wav"
wav_me="${DIR}/${ts}-me.wav"
# Output format shared by every recording: 16 kHz, mono, 16-bit PCM.
wav_opts=(-ar 16000 -ac 1 -c:a pcm_s16le)

echo "Mode: $mode — recording; PID will follow"
case "$mode" in
  dt)
    [[ -n "$DESK_MON" ]] || { notify "No desktop sink"; exit 1; }
    notify "Recording desktop audio" "with $DESK_MON"
    ffmpeg -hide_banner -loglevel error -f pulse -i "$DESK_MON" \
      "${wav_opts[@]}" "$wav_dt" &
    pid=$!
    ;;
  me)
    [[ -n "$MIC_SRC" ]] || { notify "No mic source"; exit 1; }
    notify "Recording microphone" "with $MIC_SRC"
    ffmpeg -hide_banner -loglevel error -f pulse -i "$MIC_SRC" \
      "${wav_opts[@]}" "$wav_me" &
    pid=$!
    ;;
  both)
    [[ -n "$DESK_MON" && -n "$MIC_SRC" ]] || { notify "Missing devices"; exit 1; }
    notify "Recording desktop and microphone" "with $DESK_MON and $MIC_SRC"
    ffmpeg -hide_banner -loglevel error \
      -f pulse -i "$DESK_MON" -f pulse -i "$MIC_SRC" \
      -map 0:a "${wav_opts[@]}" "$wav_dt" \
      -map 1:a "${wav_opts[@]}" "$wav_me" &
    pid=$!
    ;;
esac
# The printed PID is ffmpeg's; the recording ends when that process exits.
echo "$pid"

# Interrupting or terminating this script should stop the recording and
# still transcribe it, rather than killing the script and leaving ffmpeg
# running in the background.
trap 'kill -TERM "$pid" 2>/dev/null || true' INT TERM
wait "$pid" || true
# A trapped signal makes the first `wait` return early; wait once more so
# ffmpeg has exited before the WAV files are read. If ffmpeg was already
# reaped this second call fails harmlessly.
wait "$pid" 2>/dev/null || true
trap - INT TERM

# --- TRANSCRIBE ---
# Plain if/else (not `a && b || c`): a failed transcription must not be
# reported as a missing recording, and `set -e` stays active in transcribe.
case "$mode" in
  dt)
    if [[ -s "$wav_dt" ]]; then
      transcribe "$wav_dt"
    else
      notify "No desktop recording"
    fi
    ;;
  me)
    if [[ -s "$wav_me" ]]; then
      transcribe "$wav_me"
    else
      notify "No mic recording"
    fi
    ;;
  both)
    if [[ -s "$wav_dt" ]]; then
      transcribe "$wav_dt"
    else
      notify "No desktop recording"
    fi
    if [[ -s "$wav_me" ]]; then
      transcribe "$wav_me"
    else
      notify "No mic recording"
    fi
    ;;
esac
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment