Created
July 26, 2024 19:12
-
-
Save rozkminiacz/2d8d11779bc7d3b701d625aede9400cb to your computer and use it in GitHub Desktop.
Whisper long-file transcription + GPT-4o post-processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Transcribe a long audio file with the OpenAI transcription API and clean up
# the merged transcript with a chat-completions model.
#
# Usage:
#   <this script> <input_audio_file>
#
# Steps:
#   1. Split the input into 120-second segments with ffmpeg.
#   2. Send each segment to the transcription endpoint, passing the transcript
#      collected so far as the prompt.
#   3. Send the merged transcript to the chat-completions endpoint for cleanup.
#   4. Write the cleaned text to "<input basename without extension>.md" in
#      the current directory.
#
# Requires: ffmpeg, curl, jq.
# Environment:
#   OPENAI_API_KEY  API key to use; when unset, the placeholder below is used.
#
# Exit status: 0 on success, 1 on usage error or any failed step.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Check that exactly one input file was provided.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: $0 <input_audio_file>" >&2
  exit 1
fi

# Variables
INPUT_FILE=$1
# Strip the directory part and the last extension (if any).
BASENAME=${INPUT_FILE##*/}
BASENAME=${BASENAME%.*}
WHISPER_API_URL="https://api.openai.com/v1/audio/transcriptions" # Whisper API URL
API_KEY="${OPENAI_API_KEY:-sk-XYZ}" # API key; prefer the environment variable
MODEL="whisper-1" # Transcription model
POST_PROCESS_API_URL="https://api.openai.com/v1/chat/completions" # Post-processing API URL
SYSTEM_PROMPT="You are a helpful transcription assistant. Your task is to correct any spelling discrepancies and clean up the transcribed text. Remove filler transcription such as Napisy przygotowane przez XYZ. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided."

[[ -f "$INPUT_FILE" ]] || die "input file not found: $INPUT_FILE"

for tool in ffmpeg curl jq; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# All intermediate files live in a private temporary directory so that an
# existing ./chunks directory is never picked up or deleted, and so that
# everything is removed on every exit path.
WORK_DIR=$(mktemp -d) || die "mktemp failed"
cleanup() { rm -rf -- "${WORK_DIR:?}"; }
trap cleanup EXIT

OUTPUT_DIR="$WORK_DIR/chunks"
TRANSCRIPT_FILE="$WORK_DIR/transcript.txt"
PROMPT_FILE="$WORK_DIR/prompt.txt"
PAYLOAD_FILE="$WORK_DIR/payload.json"
mkdir -p -- "$OUTPUT_DIR"
: > "$TRANSCRIPT_FILE"

# Step 1: Split the audio file into 120-second segments.
# "-c copy" avoids re-encoding; it assumes the input's codec can be stored in
# a WAV container.
# NOTE(review): the segment length is presumably chosen to keep each upload
# under the API's file-size limit - confirm for your input format.
ffmpeg -i "$INPUT_FILE" -f segment -segment_time 120 -c copy \
  "$OUTPUT_DIR/output%03d.wav" \
  || die "ffmpeg failed to split $INPUT_FILE"

# Transcript accumulated so far; sent as context with every segment.
context=""

# Step 2: Transcribe each chunk and merge transcriptions.
for chunk in "$OUTPUT_DIR"/*.wav; do
  # An unmatched glob stays literal; treat that as "ffmpeg produced nothing".
  [[ -e "$chunk" ]] || die "no audio chunks were produced"

  echo "Processing $chunk..." >&2

  # Step 2a: Write the prompt (with real newlines) to a file. Passing it as
  # "name=<file" keeps the growing transcript out of the argument list and
  # stops curl from interpreting ';' sequences inside the text.
  printf '\n\nTranscript so far:\n%s\n\n' "$context" > "$PROMPT_FILE"

  # Step 2b: Send the chunk to the transcription API. curl sets the multipart
  # Content-Type header (including the boundary) itself when -F is used.
  response=$(curl -sS -X POST \
    -H "Authorization: Bearer $API_KEY" \
    -F "file=@${chunk}" \
    -F "model=$MODEL" \
    -F "prompt=<${PROMPT_FILE}" \
    "$WHISPER_API_URL") || die "transcription request failed for $chunk"
  printf '%s\n' "$response"

  # Step 2c: Extract the text; -e makes jq fail when .text is missing/null
  # (e.g. an API error object) instead of printing "null".
  transcription=$(jq -er '.text' <<<"$response") \
    || die "no transcription text in API response for $chunk"

  # Step 2d: Append to the merged transcript and extend the context.
  printf '%s\n' "$transcription" >> "$TRANSCRIPT_FILE"
  context="${context}"$'\n'"${transcription}"
done

# Step 3: Post-process the full transcription text.
# jq builds the JSON payload so that both strings are escaped correctly; the
# transcript is read from stdin as one raw string (-R -s).
jq -Rs --arg system "$SYSTEM_PROMPT" '{
  model: "gpt-4o",
  temperature: 0,
  messages: [
    {role: "system", content: $system},
    {role: "user", content: .}
  ]
}' < "$TRANSCRIPT_FILE" > "$PAYLOAD_FILE" \
  || die "failed to build the post-processing payload"

# Send the payload from a file (not argv) so long transcripts cannot exceed
# the OS argument-length limit.
response=$(curl -sS -X POST \
  -H "Authorization: Bearer $API_KEY" \
  -H "Content-Type: application/json" \
  --data-binary "@${PAYLOAD_FILE}" \
  "$POST_PROCESS_API_URL") || die "post-processing request failed"
printf '%s\n' "$response"

# Extract the corrected text from the response.
corrected_text=$(jq -er '.choices[0].message.content' <<<"$response") \
  || die "no corrected text in post-processing response"

# Output the corrected transcription to a .md file.
OUTPUT_FILE="${BASENAME}.md"
printf '%s\n' "$corrected_text" > "$OUTPUT_FILE"
echo "Corrected transcription saved to $OUTPUT_FILE"

# Temporary files are removed by the EXIT trap.
echo "Transcription and post-processing complete." >&2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment