Skip to content

Instantly share code, notes, and snippets.

@nrupatunga
Created April 22, 2026 09:05
Show Gist options
  • Select an option

  • Save nrupatunga/ce4573fb099a8fd31693f52af443cccc to your computer and use it in GitHub Desktop.

Select an option

Save nrupatunga/ce4573fb099a8fd31693f52af443cccc to your computer and use it in GitHub Desktop.
/**
* Whisper Voice Input — minimal pi plugin
*
* Alt+V (or the `voice` command) → starts recording; a blinking dot is shown in the
* editor when it is empty, otherwise in the status bar
* Enter → stops recording → transcribes → auto-submits (with any attachments)
*
* Existing editor text is never modified during recording — attachments and text are preserved.
*/
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import { spawn, execFileSync, execSync, type ChildProcess } from "node:child_process";
import { existsSync, mkdtempSync, rmSync } from "node:fs";
import { readFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
/** Endpoint the recorded audio is POSTed to. */
const WHISPER_URL = "https://api.openai.com/v1/audio/transcriptions";
/** Value sent in the `model` form field. */
const WHISPER_MODEL = "whisper-1";

/**
 * Uploads an audio file as `multipart/form-data` to {@link WHISPER_URL} and
 * returns the trimmed `text` field of the JSON response.
 *
 * The file is labelled as WAV when its path ends in `.wav` (any letter case)
 * and as MP3 otherwise.
 *
 * @param audioPath Path of the audio file to upload.
 * @returns The transcription with surrounding whitespace removed (may be empty).
 * @throws Error when `OPENAI_API_KEY` is unset, when the file cannot be read,
 *   when the HTTP status is not 2xx, or when the response has no string `text`.
 */
async function transcribe(audioPath: string): Promise<string> {
  // Fail before any network traffic instead of sending "Bearer undefined".
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) throw new Error("OPENAI_API_KEY not set");

  const boundary = `--B${Date.now()}`;
  const whisperPrompt = "Concise technical instruction. No filler words.";
  const isWav = audioPath.toLowerCase().endsWith(".wav");
  const filename = isWav ? "audio.wav" : "audio.mp3";
  const contentType = isWav ? "audio/wav" : "audio/mpeg";

  // One simple text part; every part is introduced by "--" + boundary.
  const field = (name: string, value: string): string =>
    `--${boundary}\r\nContent-Disposition: form-data; name="${name}"\r\n\r\n${value}\r\n`;

  const fileHeader =
    `--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="${filename}"\r\n` +
    `Content-Type: ${contentType}\r\n\r\n`;
  const trailer =
    // The leading CRLF terminates the binary file part.
    `\r\n${field("model", WHISPER_MODEL)}` +
    field("response_format", "json") +
    field("prompt", whisperPrompt) +
    field("temperature", "0") +
    `--${boundary}--\r\n`;

  const body = Buffer.concat([Buffer.from(fileHeader), await readFile(audioPath), Buffer.from(trailer)]);

  const res = await fetch(WHISPER_URL, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": `multipart/form-data; boundary=${boundary}`,
    },
    body,
  });
  if (!res.ok) throw new Error(`Whisper API ${res.status}: ${await res.text()}`);

  // Validate the payload instead of trusting a cast: a 2xx body without a
  // string `text` would otherwise surface as an opaque TypeError.
  const payload: unknown = await res.json();
  const text =
    typeof payload === "object" && payload !== null ? (payload as { text?: unknown }).text : undefined;
  if (typeof text !== "string") throw new Error("Whisper API returned no transcription text");
  return text.trim();
}
/**
 * Best-effort conversion of a WAV recording to a 16 kHz mono 64 kbit/s MP3
 * using `ffmpeg`.
 *
 * ffmpeg is invoked with an argument array rather than through a shell, so
 * paths containing quotes, `$`, backticks or spaces are passed verbatim and
 * can neither break the command nor be interpreted by the shell.
 *
 * @param wav Path of the input WAV file.
 * @param mp3 Path the MP3 should be written to (overwritten if present).
 * @returns `mp3` when ffmpeg succeeded and the file exists; otherwise `wav`,
 *   so the caller can fall back to the original recording.
 */
function toMp3(wav: string, mp3: string): string {
  const ffmpegArgs = ["-y", "-i", wav, "-ar", "16000", "-ac", "1", "-b:a", "64k", mp3];
  try {
    execFileSync("ffmpeg", ffmpegArgs, { stdio: "ignore" });
    if (existsSync(mp3)) return mp3;
  } catch {
    // ffmpeg is missing or failed: deliberately fall back to the WAV file.
  }
  return wav;
}
/** A capture source as reported by `pactl`: internal name plus display description. */
type MicInfo = { name: string; description: string };

/**
 * Extracts sources from the text printed by `pactl list sources`.
 *
 * A record starts at each `Source #` line. Records without a `Name:` line or
 * whose name contains "monitor" are dropped; a record without a
 * `Description:` line uses its name as the description.
 */
function parsePactlSources(raw: string): MicInfo[] {
  const sources: MicInfo[] = [];
  let name = "";
  let description = "";

  // Emit the record collected so far (if it qualifies) and reset for the next one.
  const flush = (): void => {
    if (name && !name.includes("monitor")) {
      sources.push({ name, description: description || name });
    }
    name = "";
    description = "";
  };

  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (trimmed.startsWith("Source #")) {
      flush();
    } else if (trimmed.startsWith("Name:")) {
      name = trimmed.slice("Name:".length).trim();
    } else if (trimmed.startsWith("Description:")) {
      description = trimmed.slice("Description:".length).trim();
    }
  }
  flush();
  return sources;
}

/**
 * Lists the non-monitor capture sources known to `pactl`.
 *
 * `pactl` is run with `LC_ALL=C` because its field labels are translated in
 * other locales, which would stop the `Source #` / `Description:` prefixes
 * from matching. Its stderr is discarded so a missing or failing `pactl`
 * does not write into the host terminal.
 *
 * @returns The parsed sources, or an empty array when `pactl` cannot be run.
 */
function getMicInfos(): MicInfo[] {
  try {
    const raw = execSync("pactl list sources", {
      encoding: "utf8",
      env: { ...process.env, LC_ALL: "C" },
      stdio: ["ignore", "pipe", "ignore"],
    });
    return parsePactlSources(raw);
  } catch {
    // Best effort: no pactl (or no sound server) simply means "no known mics".
    return [];
  }
}
/**
 * Extension entry point. Registers:
 *  - an "input" handler that, while a recording is active, treats the
 *    submission as "stop": it ends the recorder, transcribes the audio and
 *    sends the resulting text (plus any images from the event) as a user message;
 *  - the `voice` command and the `alt+v` shortcut, which start a recording;
 *  - the `voice-mics` command, which lists the detected microphones.
 *
 * Recording uses `parecord`; microphone discovery uses `pactl`.
 *
 * @param pi Extension API handle supplied by the host.
 */
export default function (pi: ExtensionAPI) {
  let recording = false;
  let recProc: ChildProcess | null = null;
  let tmpDir: string | null = null;
  let animTimer: ReturnType<typeof setInterval> | null = null;
  let savedEditorText = "";

  /** Upper bound on how long to wait for the recorder to exit after SIGTERM. */
  const RECORDER_EXIT_TIMEOUT_MS = 2000;

  /** Stops the blinking "REC" animation, if one is running. */
  const stopAnimation = (): void => {
    if (animTimer) {
      clearInterval(animTimer);
      animTimer = null;
    }
  };

  /**
   * Sends SIGTERM to the recorder and resolves once it has exited, so the WAV
   * file is complete before it is read. Resolves immediately when there is no
   * live process, and after RECORDER_EXIT_TIMEOUT_MS at the latest.
   */
  const stopRecorder = (proc: ChildProcess | null): Promise<void> =>
    new Promise((resolve) => {
      if (!proc || proc.exitCode !== null || proc.signalCode !== null) {
        resolve();
        return;
      }
      const guard = setTimeout(resolve, RECORDER_EXIT_TIMEOUT_MS);
      proc.once("exit", () => {
        clearTimeout(guard);
        resolve();
      });
      proc.kill("SIGTERM");
    });

  // Intercept Enter while recording
  pi.on("input", async (event, ctx) => {
    if (!recording) return { action: "continue" as const };

    // Capture images before anything asynchronous happens
    const images = event.images?.length ? [...event.images] : null;

    // Snapshot and reset the shared state *before* the first await, so a new
    // recording started while this one is being finalised cannot swap
    // tmpDir / recProc / savedEditorText underneath this handler.
    recording = false;
    stopAnimation();
    const proc = recProc;
    const tmp = tmpDir;
    const origText = savedEditorText;
    recProc = null;
    tmpDir = null;
    savedEditorText = "";

    await stopRecorder(proc);

    if (!tmp) {
      // Not expected while `recording` is true; nothing to transcribe or clean up.
      ctx.ui.setStatus("voice", undefined);
      return { action: "handled" as const };
    }

    const wav = join(tmp, "rec.wav");
    const mp3 = join(tmp, "rec.mp3");
    let transcribingTimer: ReturnType<typeof setInterval> | null = null;
    const stopSpinner = (): void => {
      if (transcribingTimer) {
        clearInterval(transcribingTimer);
        transcribingTimer = null;
      }
    };

    try {
      if (!existsSync(wav)) {
        // With an initially empty editor the REC banner was drawn in the
        // editor itself; remove it instead of leaving it behind.
        if (!origText) ctx.ui.setEditorText("");
        ctx.ui.notify("No audio recorded", "error");
        return { action: "handled" as const };
      }

      const frames = ["⏳ Transcribing ", "⏳ Transcribing. ", "⏳ Transcribing.. ", "⏳ Transcribing..."];
      let frame = 0;
      ctx.ui.setEditorText(frames[0]);
      transcribingTimer = setInterval(() => {
        ctx.ui.setEditorText(frames[frame % frames.length]);
        frame++;
      }, 180);

      const text = await transcribe(toMp3(wav, mp3));
      stopSpinner();

      if (!text) {
        ctx.ui.setEditorText(origText || "");
        ctx.ui.notify("Empty transcription", "warning");
      } else {
        // Combine: original editor text + transcribed voice
        const fullText = origText ? `${origText} ${text}` : text;
        // Replace transcribing animation with recognized text, briefly visible before submit
        ctx.ui.setEditorText(fullText);
        await new Promise((r) => setTimeout(r, 300));
        // Submit with images if any
        if (images?.length) {
          const content: any[] = [{ type: "text", text: fullText }];
          content.push(...images);
          pi.sendUserMessage(content);
        } else {
          pi.sendUserMessage(fullText);
        }
        ctx.ui.setEditorText("");
      }
    } catch (e: unknown) {
      stopSpinner();
      ctx.ui.setEditorText(origText || "");
      ctx.ui.notify(e instanceof Error ? e.message : String(e), "error");
    } finally {
      ctx.ui.setStatus("voice", undefined);
      rmSync(tmp, { recursive: true, force: true });
    }
    return { action: "handled" as const };
  });

  /**
   * Starts a `parecord` capture into a fresh temp directory and shows a
   * blinking REC indicator. No-op when a recording is already active or
   * `OPENAI_API_KEY` is unset.
   */
  function startRecording(ctx: any) {
    if (recording) return;
    if (!process.env.OPENAI_API_KEY) {
      ctx.ui.notify("OPENAI_API_KEY not set", "error");
      return;
    }

    // Save whatever text is in the editor
    savedEditorText = (ctx.ui.getEditorText?.() || "").trim();
    const dir = mkdtempSync(join(tmpdir(), "piv-"));
    tmpDir = dir;
    const wav = join(dir, "rec.wav");

    // Pick mic: WHISPER_MIC (exact or substring) > default source > bluetooth source > first source
    const recArgs = ["--format=s16le", "--rate=16000", "--channels=1", "--file-format=wav"];
    let selectedMicLabel = "default";
    try {
      const mics = getMicInfos();
      const inputs = mics.map((m) => m.name);
      // stderr is discarded so a pactl failure does not write into the terminal UI.
      const defaultSrc = execSync("pactl get-default-source", {
        encoding: "utf8",
        stdio: ["ignore", "pipe", "ignore"],
      }).trim();
      const envMicRaw = (process.env.WHISPER_MIC || "").trim();
      const envMic = envMicRaw.toLowerCase();
      let device: string | undefined;
      if (envMic) {
        device =
          inputs.find((d) => d === envMicRaw) ||
          inputs.find((d) => d.toLowerCase().includes(envMic)) ||
          mics.find((m) => m.description.toLowerCase().includes(envMic))?.name;
      }
      if (!device && inputs.includes(defaultSrc)) device = defaultSrc;
      if (!device) device = inputs.find((d) => d.includes("bluez_source") || d.includes("bluez_input"));
      if (!device) device = inputs[0];
      if (device) {
        recArgs.unshift(`--device=${device}`);
        selectedMicLabel = mics.find((m) => m.name === device)?.description || device;
      }
    } catch {
      // Best effort: without pactl, parecord is started with no --device argument.
    }
    recArgs.push(wav);
    ctx.ui.notify(`Using mic: ${selectedMicLabel}`, "info");

    const proc = spawn("parecord", recArgs, { stdio: "ignore" });
    recProc = proc;
    recording = true;
    const animateInEditor = !savedEditorText;

    // Without an 'error' listener a failed spawn (e.g. parecord not installed)
    // is raised as an uncaught exception in the host process.
    proc.on("error", (err) => {
      if (recProc !== proc) return; // this recording was already stopped or replaced
      recording = false;
      recProc = null;
      tmpDir = null;
      savedEditorText = "";
      stopAnimation();
      if (animateInEditor) ctx.ui.setEditorText("");
      ctx.ui.setStatus("voice", undefined);
      rmSync(dir, { recursive: true, force: true });
      ctx.ui.notify(`Could not start parecord: ${err.message}`, "error");
    });

    const dots = ["●", "○"];
    let tick = 0;
    if (animateInEditor) {
      // Empty editor — animate in prompt
      ctx.ui.setEditorText("● REC (Enter to stop)");
      animTimer = setInterval(() => {
        ctx.ui.setEditorText(`${dots[tick % 2]} REC (Enter to stop)`);
        tick++;
      }, 350);
    } else {
      // Has content/attachments — animate in status bar only
      ctx.ui.setStatus("voice", "● REC (Enter to stop)");
      animTimer = setInterval(() => {
        ctx.ui.setStatus("voice", `${dots[tick % 2]} REC (Enter to stop)`);
        tick++;
      }, 350);
    }
  }

  pi.registerCommand("voice", {
    description: "Voice -> Whisper -> auto-submit",
    handler: (_, ctx) => startRecording(ctx),
  });

  pi.registerCommand("voice-mics", {
    description: "Show available microphone names (friendly + internal)",
    handler: (_, ctx) => {
      const mics = getMicInfos();
      if (!mics.length) {
        ctx.ui.notify("No microphones detected", "warning");
        return;
      }
      let def = "";
      try {
        def = execSync("pactl get-default-source", {
          encoding: "utf8",
          stdio: ["ignore", "pipe", "ignore"],
        }).trim();
      } catch {
        // No default source available: list the mics without a "*" marker.
      }
      const rows = mics.map((m) => `${m.name === def ? "*" : " "} ${m.description} [${m.name}]`);
      ctx.ui.notify(`Mics:\n${rows.join("\n")}`, "info");
    },
  });

  pi.registerShortcut("alt+v", {
    description: "Voice input",
    handler: (ctx) => startRecording(ctx),
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment