rozkminiacz · July 26, 2024 19:12
diff --git a/transcribe.sh b/transcribe.sh
 #!/bin/bash
 #
 # Transcribe an audio file with the Whisper API, then clean up the merged
 # transcript with a chat-completions request.
 #
 # Usage: transcribe.sh <input_audio_file>
 #
 # Output: <input name without its last extension>.md in the current directory.
 # Environment: OPENAI_API_KEY overrides the placeholder API key below.
 # Requires: ffmpeg, curl, jq.

 # Check if an input file was provided
 if [ "$#" -ne 1 ]; then
     echo "Usage: $0 <input_audio_file>" >&2
     exit 1
 fi

 # Variables
 INPUT_FILE=$1
 # Strip the directory and the last extension (a name without a dot is kept as is)
 BASENAME=$(basename -- "$INPUT_FILE")
 BASENAME=${BASENAME%.*}
 WHISPER_API_URL="https://api.openai.com/v1/audio/transcriptions"  # Whisper API URL
 API_KEY="${OPENAI_API_KEY:-sk-XYZ}"  # Whisper API key (taken from the environment when set)
 MODEL="whisper-1"  # Set the model parameter
 POST_PROCESS_API_URL="https://api.openai.com/v1/chat/completions"  # Post-processing API URL
 SYSTEM_PROMPT="You are a helpful transcription assistant. Your task is to correct any spelling discrepancies and clean up the transcribed text. Remove filler transcription such as Napisy przygotowane przez XYZ. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided."
 NL=$'\n'  # a real newline; "\n" inside double quotes is a literal backslash-n

 # Create a private working directory next to the caller. A unique name keeps
 # stale chunks from earlier runs out of the transcript and guarantees that
 # cleanup never deletes a pre-existing "chunks" directory.
 OUTPUT_DIR=$(mktemp -d chunks.XXXXXX) || {
     echo "Failed to create a working directory" >&2
     exit 1
 }
 TRANSCRIPT_FILE="$OUTPUT_DIR/transcript.txt"

 # Cleanup runs on every exit path, including the error exits below
 trap 'rm -rf -- "$OUTPUT_DIR"' EXIT

 # Step 1: Split the audio file into 120-second chunks
 if ! ffmpeg -i "$INPUT_FILE" -f segment -segment_time 120 -c copy "$OUTPUT_DIR/output%03d.wav"; then
     echo "ffmpeg failed to split $INPUT_FILE" >&2
     exit 1
 fi

 # Initialize an empty string for the context
 context=""
 chunk_count=0

 # Step 2: Transcribe each chunk and merge transcriptions
 for chunk in "$OUTPUT_DIR"/*.wav; do
     # An unmatched glob stays literal; skip it instead of uploading "*.wav"
     [ -e "$chunk" ] || continue
     chunk_count=$((chunk_count + 1))
     echo "Processing $chunk..." >&2

     # Step 2a: Prepare the prompt with the context
     prompt="${NL}${NL}Transcript so far:${NL}${context}${NL}${NL}"

     # Step 2b: Send chunk to Whisper API with the context prompt.
     # --form-string sends the text literally, so a transcript that starts with
     # '@' or '<' or contains ';type=' is not interpreted by curl. No explicit
     # Content-Type header: curl generates the multipart header (with boundary).
     if ! response=$(curl -sS -X POST -H "Authorization: Bearer $API_KEY" \
         -F "file=@${chunk}" --form-string "model=$MODEL" --form-string "prompt=$prompt" \
         "$WHISPER_API_URL"); then
         echo "Whisper API request failed for $chunk" >&2
         exit 1
     fi

     printf '%s\n' "$response"

     # Step 2c: Extract transcription text from response.
     # jq -e fails when .text is missing/null (e.g. an API error object), so an
     # error response is never written into the transcript as "null".
     if ! transcription=$(printf '%s' "$response" | jq -er '.text'); then
         echo "Whisper API returned no transcription for $chunk" >&2
         exit 1
     fi

     # Step 2d: Append transcription to the final transcription file
     printf '%s\n' "$transcription" >> "$TRANSCRIPT_FILE"

     # Update context with the current segment's transcription
     context="${context}${NL}${transcription}"
 done

 if [ "$chunk_count" -eq 0 ]; then
     echo "No audio chunks were produced from $INPUT_FILE" >&2
     exit 1
 fi

 # Step 3: Post-process the full transcription text.
 # jq builds and escapes the whole JSON payload; the transcript is read
 # straight from the file as one raw string (-R -s).
 if ! JSON_PAYLOAD=$(jq -Rs --arg system "$SYSTEM_PROMPT" \
     '{model: "gpt-4o", temperature: 0, messages: [{role: "system", content: $system}, {role: "user", content: .}]}' \
     "$TRANSCRIPT_FILE"); then
     echo "Failed to build the post-processing request" >&2
     exit 1
 fi

 # Send the post-processing request (payload on stdin, sent unmodified)
 if ! response=$(printf '%s' "$JSON_PAYLOAD" | curl -sS -X POST -H "Authorization: Bearer $API_KEY" \
     -H "Content-Type: application/json" --data-binary @- "$POST_PROCESS_API_URL"); then
     echo "Post-processing API request failed" >&2
     exit 1
 fi

 printf '%s\n' "$response"

 # Extract the corrected text from the response
 if ! corrected_text=$(printf '%s' "$response" | jq -er '.choices[0].message.content'); then
     echo "Post-processing API returned no corrected text" >&2
     exit 1
 fi

 # Output the corrected transcription to a .md file
 OUTPUT_FILE="${BASENAME}.md"
 printf '%s\n' "$corrected_text" > "$OUTPUT_FILE"
 echo "Corrected transcription saved to $OUTPUT_FILE"

 echo "Transcription and post-processing complete." >&2
	#!/bin/bash
	#
	# Transcribe an audio file with the Whisper API, then clean up the merged
	# transcript with a chat-completions request.
	#
	# Usage: transcribe.sh <input_audio_file>
	#
	# Output: <input name without its last extension>.md in the current directory.
	# Environment: OPENAI_API_KEY overrides the placeholder API key below.
	# Requires: ffmpeg, curl, jq.

	# Check if an input file was provided
	if [ "$#" -ne 1 ]; then
	    echo "Usage: $0 <input_audio_file>" >&2
	    exit 1
	fi

	# Variables
	INPUT_FILE=$1
	# Strip the directory and the last extension (a name without a dot is kept as is)
	BASENAME=$(basename -- "$INPUT_FILE")
	BASENAME=${BASENAME%.*}
	WHISPER_API_URL="https://api.openai.com/v1/audio/transcriptions" # Whisper API URL
	API_KEY="${OPENAI_API_KEY:-sk-XYZ}" # Whisper API key (taken from the environment when set)
	MODEL="whisper-1" # Set the model parameter
	POST_PROCESS_API_URL="https://api.openai.com/v1/chat/completions" # Post-processing API URL
	SYSTEM_PROMPT="You are a helpful transcription assistant. Your task is to correct any spelling discrepancies and clean up the transcribed text. Remove filler transcription such as Napisy przygotowane przez XYZ. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided."
	NL=$'\n' # a real newline; "\n" inside double quotes is a literal backslash-n

	# Create a private working directory next to the caller. A unique name keeps
	# stale chunks from earlier runs out of the transcript and guarantees that
	# cleanup never deletes a pre-existing "chunks" directory.
	OUTPUT_DIR=$(mktemp -d chunks.XXXXXX) || {
	    echo "Failed to create a working directory" >&2
	    exit 1
	}
	TRANSCRIPT_FILE="$OUTPUT_DIR/transcript.txt"

	# Cleanup runs on every exit path, including the error exits below
	trap 'rm -rf -- "$OUTPUT_DIR"' EXIT

	# Step 1: Split the audio file into 120-second chunks
	if ! ffmpeg -i "$INPUT_FILE" -f segment -segment_time 120 -c copy "$OUTPUT_DIR/output%03d.wav"; then
	    echo "ffmpeg failed to split $INPUT_FILE" >&2
	    exit 1
	fi

	# Initialize an empty string for the context
	context=""
	chunk_count=0

	# Step 2: Transcribe each chunk and merge transcriptions
	for chunk in "$OUTPUT_DIR"/*.wav; do
	    # An unmatched glob stays literal; skip it instead of uploading "*.wav"
	    [ -e "$chunk" ] || continue
	    chunk_count=$((chunk_count + 1))
	    echo "Processing $chunk..." >&2

	    # Step 2a: Prepare the prompt with the context
	    prompt="${NL}${NL}Transcript so far:${NL}${context}${NL}${NL}"

	    # Step 2b: Send chunk to Whisper API with the context prompt.
	    # --form-string sends the text literally, so a transcript that starts with
	    # '@' or '<' or contains ';type=' is not interpreted by curl. No explicit
	    # Content-Type header: curl generates the multipart header (with boundary).
	    if ! response=$(curl -sS -X POST -H "Authorization: Bearer $API_KEY" \
	        -F "file=@${chunk}" --form-string "model=$MODEL" --form-string "prompt=$prompt" \
	        "$WHISPER_API_URL"); then
	        echo "Whisper API request failed for $chunk" >&2
	        exit 1
	    fi

	    printf '%s\n' "$response"

	    # Step 2c: Extract transcription text from response.
	    # jq -e fails when .text is missing/null (e.g. an API error object), so an
	    # error response is never written into the transcript as "null".
	    if ! transcription=$(printf '%s' "$response" | jq -er '.text'); then
	        echo "Whisper API returned no transcription for $chunk" >&2
	        exit 1
	    fi

	    # Step 2d: Append transcription to the final transcription file
	    printf '%s\n' "$transcription" >> "$TRANSCRIPT_FILE"

	    # Update context with the current segment's transcription
	    context="${context}${NL}${transcription}"
	done

	if [ "$chunk_count" -eq 0 ]; then
	    echo "No audio chunks were produced from $INPUT_FILE" >&2
	    exit 1
	fi

	# Step 3: Post-process the full transcription text.
	# jq builds and escapes the whole JSON payload; the transcript is read
	# straight from the file as one raw string (-R -s). No heredoc is used, so
	# the script does not depend on an unindented terminator line.
	if ! JSON_PAYLOAD=$(jq -Rs --arg system "$SYSTEM_PROMPT" \
	    '{model: "gpt-4o", temperature: 0, messages: [{role: "system", content: $system}, {role: "user", content: .}]}' \
	    "$TRANSCRIPT_FILE"); then
	    echo "Failed to build the post-processing request" >&2
	    exit 1
	fi

	# Send the post-processing request (payload on stdin, sent unmodified)
	if ! response=$(printf '%s' "$JSON_PAYLOAD" | curl -sS -X POST -H "Authorization: Bearer $API_KEY" \
	    -H "Content-Type: application/json" --data-binary @- "$POST_PROCESS_API_URL"); then
	    echo "Post-processing API request failed" >&2
	    exit 1
	fi

	printf '%s\n' "$response"

	# Extract the corrected text from the response
	if ! corrected_text=$(printf '%s' "$response" | jq -er '.choices[0].message.content'); then
	    echo "Post-processing API returned no corrected text" >&2
	    exit 1
	fi

	# Output the corrected transcription to a .md file
	OUTPUT_FILE="${BASENAME}.md"
	printf '%s\n' "$corrected_text" > "$OUTPUT_FILE"
	echo "Corrected transcription saved to $OUTPUT_FILE"

	echo "Transcription and post-processing complete." >&2