Created
November 29, 2025 13:11
-
-
Save LaptopDev/ad051f7005e87c46e9674db172544420 to your computer and use it in GitHub Desktop.
Linux transcription (X11 & Wayland) with whisper.cpp and its built-in VAD. Records the microphone and/or desktop audio and transcribes the result; existing .wav files can be transcribed by passing them after `--retranscribe`.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Record desktop and/or microphone audio with ffmpeg and transcribe it with
# whisper.cpp, or transcribe existing recordings.
#
# Usage:
#   <script> [dt|me|both]
#   <script> --retranscribe file1.wav [file2.wav ...]
#
# Optional environment overrides: WHISPER_BIN, WHISPER_MODEL, VAD_MODEL.
set -euo pipefail

# CONFIG
DIR="${HOME}/.transcription"
mkdir -p "$DIR"

# Each path keeps its previous value as the default but can now be overridden
# from the environment instead of editing the script.
WHISPER_BIN="${WHISPER_BIN:-/home/user/source/git/whisper_gpu.cpp/bin/whisper-cli}"
WHISPER_MODEL="${WHISPER_MODEL:-/home/user/source/git/whisper_gpu.cpp/models/ggml-large-v3.bin}"
VAD_MODEL="${VAD_MODEL:-/home/user/source/git/whisper_gpu.cpp/models/ggml-silero-v5.1.2.bin}"

# dependencies
# wpctl and xsel are intentionally not hard requirements: the script falls back
# to `pactl get-default-sink` and to `xclip` when they are absent, and those
# fallbacks were unreachable while this check rejected the script first.
for cmd in ffmpeg ffprobe pactl notify-send; do
  command -v "$cmd" >/dev/null 2>&1 || {
    echo "Missing dependency: $cmd" >&2
    exit 127
  }
done

if ! command -v xsel >/dev/null 2>&1 && ! command -v xclip >/dev/null 2>&1; then
  echo "Warning: neither xsel nor xclip found; clipboard copy will be skipped" >&2
fi

[[ -x "$WHISPER_BIN" ]] || { echo "Missing whisper binary: $WHISPER_BIN" >&2; exit 127; }
[[ -f "$WHISPER_MODEL" ]] || { echo "Missing model file: $WHISPER_MODEL" >&2; exit 127; }
# Only recordings of 20 s or more use the VAD model, so a missing file is a
# warning rather than a fatal error.
[[ -f "$VAD_MODEL" ]] || echo "Warning: VAD model not found: $VAD_MODEL" >&2
#######################################
# Show a desktop notification, or print it to stdout when notify-send fails.
# Globals:   MOTIVE (read) - notification title
# Arguments: message words; joined into a single body string
# Outputs:   "[MOTIVE] message" on stdout only when notify-send fails
#######################################
notify() {
  local body="$*"
  if ! notify-send "$MOTIVE" "$body" 2>/dev/null; then
    echo "[$MOTIVE] $body"
  fi
}
#######################################
# Transcribe one audio file with whisper-cli.
#
# Recordings shorter than 20 s are transcribed directly; longer ones use
# whisper's VAD options when the VAD model file exists.
#
# Globals:   WHISPER_BIN, WHISPER_MODEL, VAD_MODEL (read)
# Arguments: $1 - path to the audio file
# Outputs:   <name>.srt (timestamped lines) and <name>.txt (plain text) next
#            to the input, where <name> is the path with a trailing ".wav"
#            removed; the plain text is also copied to the clipboard via
#            xsel or xclip when one is available; progress via notify
# Returns:   0 on success or when the file does not exist (a notification is
#            shown); 1 when whisper-cli fails
#######################################
transcribe() {
  local file="${1:-}"
  if [[ ! -f "$file" ]]; then
    notify "No file to transcribe: $file"
    return 0
  fi
  notify "Transcribing ${file##*/}"

  local start_time end_time duration audio_len
  start_time=$(date +%s)

  # Duration in whole seconds. Anything that is not a plain integer (ffprobe
  # failure, empty output) is treated as 0, which selects the non-VAD path.
  audio_len=$(ffprobe -v error -show_entries format=duration \
    -of default=nw=1:nk=1 "$file" | awk '{printf "%d\n",$1}') || audio_len=""
  [[ "$audio_len" =~ ^[0-9]+$ ]] || audio_len=0

  local -a whisper_args=(-m "$WHISPER_MODEL" -f "$file")
  if (( audio_len >= 20 )); then
    if [[ -f "${VAD_MODEL:-}" ]]; then
      whisper_args+=(
        --vad
        --vad-model "$VAD_MODEL"
        --vad-threshold 0.6
        --vad-min-silence-duration-ms 200
      )
    else
      # Without the model file the VAD run cannot work; transcribing without
      # VAD still produces a transcript.
      notify "VAD model not found, transcribing without VAD"
    fi
  fi

  local out_srt
  if ! out_srt=$("$WHISPER_BIN" "${whisper_args[@]}"); then
    # Do not write empty output files or clobber the clipboard on failure.
    notify "Transcription failed: ${file##*/}"
    return 1
  fi

  end_time=$(date +%s)
  duration=$(( end_time - start_time ))

  # remove whisper header (first line of the captured output)
  out_srt=$(printf '%s\n' "$out_srt" | sed '1d')

  # strip "[hh:mm:ss.mmm --> hh:mm:ss.mmm]" prefixes -> TXT
  # sed -E replaces the former perl one-liner so perl is not needed.
  local out_txt
  out_txt=$(printf '%s\n' "$out_srt" \
    | sed -E 's/^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}[[:space:]]+-->[[:space:]]+[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}\][[:space:]]*//')

  local stem="${file%.wav}"
  local srt_file="${stem}.srt"
  local txt_file="${stem}.txt"
  printf '%s\n' "$out_srt" > "$srt_file"
  printf '%s\n' "$out_txt" > "$txt_file"

  # clipboard (best effort: the files are already written, so a clipboard
  # failure such as a missing display must not abort the script)
  if command -v xsel >/dev/null 2>&1; then
    printf '%s' "$out_txt" | xsel --clipboard --input \
      || echo "Clipboard copy failed (xsel)" >&2
  elif command -v xclip >/dev/null 2>&1; then
    printf '%s' "$out_txt" | xclip -selection clipboard \
      || echo "Clipboard copy failed (xclip)" >&2
  fi

  notify "Saved transcription: ${txt_file##*/} (${duration}s)"
}
# --- MODE ---
# `--retranscribe file...`: transcribe existing recordings instead of recording.
if [[ "${1:-}" == "--retranscribe" ]]; then
  MOTIVE="Transcriber"
  shift
  if (( $# < 1 )); then
    # Usage errors are diagnostics, so they go to stderr.
    echo "Usage: $0 --retranscribe file1.wav [file2.wav ...]" >&2
    exit 2
  fi
  failed=0
  for f in "$@"; do
    # `||` keeps one failing file from aborting the rest of the batch under
    # `set -e`; the failure is reflected in the exit status instead.
    transcribe "$f" || failed=1
  done
  exit "$failed"
fi
# ORIGINAL RECORDING MODES
# $1 selects what to record: dt (desktop, default), me (microphone) or both.
mode="${1:-dt}"
case "$mode" in
  dt | me | both) ;;
  *)
    echo "Usage: $0 [dt|me|both] OR $0 --retranscribe file.wav [...]" >&2
    exit 2
    ;;
esac
MOTIVE="Recorder"

#######################################
# Print the name of the default output sink, or nothing if none is found.
# Tries the "Default Configured Devices" section of `wpctl status` first and
# falls back to `pactl get-default-sink` when wpctl is missing or lists no
# configured sink.
# Outputs: sink name on stdout (no trailing newline), possibly empty
#######################################
default_sink() {
  local sink=""
  if command -v wpctl >/dev/null 2>&1; then
    sink="$(wpctl status \
      | awk '/Default Configured Devices:/{f=1;next} f&&/Audio\/Sink/{print $NF; exit}' \
      || true)"
  fi
  if [[ -z "$sink" ]]; then
    sink="$(pactl get-default-sink || true)"
  fi
  printf '%s' "$sink"
}

# A failed lookup leaves the variable empty rather than aborting under
# `set -e`, so a mode that does not need that device can still run.
MIC_SRC="$(pactl get-default-source || true)"
SINK="$(default_sink)"
# The monitor source of the sink carries the desktop audio; stays empty when
# no sink was found.
DESK_MON="${SINK:+$SINK.monitor}"

ts=$(date '+%F_%H-%M-%S')
wav_dt="${DIR}/${ts}-dt.wav"
wav_me="${DIR}/${ts}-me.wav"
echo "Mode: $mode — recording; PID will follow"
# Start ffmpeg in the background for the selected mode. Every output is
# 16 kHz mono 16-bit PCM WAV. The ffmpeg PID is printed so the recording can
# be stopped from outside with `kill <pid>`.
case "$mode" in
  dt)
    [[ -n "$DESK_MON" ]] || { notify "No desktop sink"; exit 1; }
    notify "Recording desktop audio" "with $DESK_MON"
    ffmpeg -hide_banner -loglevel error -f pulse -i "$DESK_MON" \
      -ar 16000 -ac 1 -c:a pcm_s16le "$wav_dt" &
    pid=$!
    ;;
  me)
    [[ -n "$MIC_SRC" ]] || { notify "No mic source"; exit 1; }
    notify "Recording microphone" "with $MIC_SRC"
    ffmpeg -hide_banner -loglevel error -f pulse -i "$MIC_SRC" \
      -ar 16000 -ac 1 -c:a pcm_s16le "$wav_me" &
    pid=$!
    ;;
  both)
    [[ -n "$DESK_MON" && -n "$MIC_SRC" ]] || { notify "Missing devices"; exit 1; }
    notify "Recording desktop and microphone" "with $DESK_MON and $MIC_SRC"
    # One ffmpeg process, two inputs, each mapped to its own output file.
    ffmpeg -hide_banner -loglevel error \
      -f pulse -i "$DESK_MON" -f pulse -i "$MIC_SRC" \
      -map 0:a -ar 16000 -ac 1 -c:a pcm_s16le "$wav_dt" \
      -map 1:a -ar 16000 -ac 1 -c:a pcm_s16le "$wav_me" &
    pid=$!
    ;;
esac
echo "$pid"

# Without a trap, Ctrl-C or TERM sent to this script kills it before the
# transcription stage runs. Forward the request to ffmpeg and carry on.
stop_recording() { kill -TERM "$pid" 2>/dev/null || true; }
trap stop_recording INT TERM

# ffmpeg exits non-zero when it is stopped by a signal; that is the normal way
# to end a recording, so the status is ignored.
wait "$pid" || true
# A trapped signal makes the first `wait` return immediately; wait once more
# so ffmpeg has actually exited before the recording is read.
wait "$pid" 2>/dev/null || true
trap - INT TERM
# --- TRANSCRIBE ---
#######################################
# Transcribe a recording if it exists and is non-empty, otherwise notify.
# An explicit if/else replaces `test && transcribe || notify`, which also
# showed the "no recording" message whenever transcribe itself failed.
# Arguments: $1 - path to the recorded WAV file
#            $2 - notification text used when the file is missing or empty
#######################################
transcribe_if_recorded() {
  local wav="$1" missing_msg="$2"
  if [[ -s "$wav" ]]; then
    # A failed transcription must not stop the second file in `both` mode.
    transcribe "$wav" || echo "Transcription failed: $wav" >&2
  else
    notify "$missing_msg"
  fi
}

case "$mode" in
  dt) transcribe_if_recorded "$wav_dt" "No desktop recording" ;;
  me) transcribe_if_recorded "$wav_me" "No mic recording" ;;
  both)
    transcribe_if_recorded "$wav_dt" "No desktop recording"
    transcribe_if_recorded "$wav_me" "No mic recording"
    ;;
esac
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment