Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save AustinSaintAubin/a50b29ce52de5501a6dd05bf5d24cd44 to your computer and use it in GitHub Desktop.
Save AustinSaintAubin/a50b29ce52de5501a6dd05bf5d24cd44 to your computer and use it in GitHub Desktop.
Whisper ASR Webservice | File Processing Script | 2023/09/23 | v2.8
#!/bin/bash
# Startup banner: script title/version, author and license, one per line.
printf '%s\n' \
  "Whisper ASR Webservice | File Proccessing Script | 2023/09/23 | v2.8" \
  "Author: Austin St. Aubin w/ a little help from ChatGPT." \
  "License: MIT License"

# Transcribes audio files through a Whisper ASR web service and writes the
# transcripts in one or more formats. Audio is read from SOURCE_PATH; each
# transcript is written next to its audio file (or to the --dest-dir folder)
# using the extensions listed in TRANSCRIPT_EXTENSIONS.
#   https://github.com/ahmetoner/whisper-asr-webservice/issues/93
#   https://gist.github.com/AustinSaintAubin/a50b29ce52de5501a6dd05bf5d24cd44
# - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Example:
# bash /volume1/docker/whisper-asr-webservice/whisper-asr-webservice_file-processor.sh "/volume1/docker/whisper-asr-webservice/audio" --output "txt,tsv,srt,vtt,json" --gpu --reprocess
# -------------------------------------------------------

# --- Default values (each can be overridden by a command-line option) ---

# Host of the ASR service. Other hosts noted by the author: "192.168.20.27", "192.168.10.91".
WISPER_ASR_SERVER_IP="localhost"
# Base URL of the instance used for CPU processing.
WISPER_ASR_SERVER_CPU="http://${WISPER_ASR_SERVER_IP}:9005"
# Base URL of the instance used for GPU processing.
WISPER_ASR_SERVER_GPU="http://${WISPER_ASR_SERVER_IP}:9006"
# WISPER_ASR_SERVER is intentionally left unset here; it is filled in later from
# --server / --cpu / --gpu. Previous draft kept for reference:
# WISPER_ASR_SERVER="${WISPER_ASR_SERVER:-WISPER_ASR_SERVER_GPU}" # "http://localhost:9000"

# Value sent as the "task" query parameter.
WISPER_ASR_TASK="transcribe"
# Value sent as the "language" query parameter.
WISPER_ASR_LANG="en"
# Optional initial prompt, e.g. "- Hey how are you doing? - I'm doing good. How are you?"
WISPER_ASR_INITIAL_PROMPT=""
# Comma-separated transcript formats; "txt,tsv,srt,vtt,json" are the supported extensions.
TRANSCRIPT_EXTENSIONS="txt,tsv,srt"
# true/false | print the contents of each destination file to the terminal.
DESTINATION_OUTPUT_PRINT=false
# true/false | when true, an existing destination file is deleted and regenerated.
DESTINATION_REPROCESS=false
# true/false | when true, the temporary transcoded audio file is kept once the script finishes.
SOURCE_TEMP_TRANSCODING_KEEP=false
# User Output Helper
# Prints the option reference to stderr (followed by one blank line) and then a
# ready-to-copy example invocation to stdout. Takes no arguments.
helper_output() {
  cat >&2 <<EOF
Usage: $(basename "$0") AUDIO-FILE [OPTIONS]
NOTE: Path to the source audio file/folder is the only input required. Can be passed without flag. All other parrameaters/options are optional.
-i, --input Path to input audio file. (mp3|mp4|wav|m4a)
-o, --output List of transcription output types, listed by extention, comma seperated. (txt,tsv,srt,vtt,json)
-s, --server Server Address. (http://localhost:9000)
-t, --task Task for the server. (transcribe)
-l, --lang Language of source audio file (en)
-e,--initial-prompt Intital Prompt for use in adding contect to wisper proccesing.
-p, --print Print contents of destination file when done.
-r, --reprocess Delete and reproccess selected transcription types.
-k, --keep-temp Keep converted tempararty audio files from M4A transcoding.
-d, --dest-dir Destination dirrectory, if differs from source.
-c/-g, --cpu / --gpu Secifies which Wisper ASR Webservice Server to use... the one for CPU or GPU realted task.
--help Shows this guide.

EOF
  # The example uses the script's resolved path and the current working directory.
  printf 'Example: bash %s "%s" --output "txt,srt" --print --gpu\n' \
    "$(realpath "$0")" "$(pwd)"
}
# Time duration
# Formats a number of whole seconds ($1) as
#   "Duration: HH hours, MM minutes, SS seconds = HH:MM:SS"
# followed by a newline. Each field is zero-padded to at least two digits.
format_time() {
  local total=$1
  local hh=$(( total / 3600 ))
  local mm=$(( total % 3600 / 60 ))
  local ss=$(( total % 60 ))
  printf 'Duration: %02d hours, %02d minutes, %02d seconds = %02d:%02d:%02d\n' \
    "$hh" "$mm" "$ss" "$hh" "$mm" "$ss"
}
# File size
# Prints the first column of a long, human-readable `ls` listing for the given
# path(s). Because the size option (-s) is on, that first column is the
# allocated size reported by ls (e.g. "4.0K").
format_file_size() {
  ls -l -a -h -s "$@" | awk '{ print $1 }'
}
# File audio duration
# Prints the HH:MM:SS portion of every "Duration" line that `ffmpeg -i` reports
# for the given argument(s); all arguments are forwarded to ffmpeg after -i.
# The exit status is that of the final grep (non-zero when nothing matched).
format_audio_durration() {
  # ffmpeg's stdin is detached: this helper is called from inside a
  # `... | while read` loop, and a child that reads the inherited stdin would
  # swallow the list of files still waiting to be processed.
  # (Pattern including hundredths, if ever needed: [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{2})
  ffmpeg -i "$@" 2>&1 < /dev/null \
    | grep 'Duration' \
    | grep -oP "[0-9]{2}:[0-9]{2}:[0-9]{2}"
}
# Task summary line
# Prints one line combining the elapsed time of a task with the size and audio
# duration of a file.
#   $1 - elapsed time in seconds (passed to format_time)
#   $2 - path of the file to describe (passed to format_file_size and
#        format_audio_durration)
output_task_file_info() {
  local seconds=$1
  local filepath=$2
  # Arguments are quoted so a path containing spaces, commas or glob characters
  # reaches each helper as exactly one argument regardless of the current IFS.
  echo " └─ Task $(format_time "${seconds}") | File Size: $(format_file_size "${filepath}") | Audio Durration: $(format_audio_durration "${filepath}")"
}
# Parse Options
# Let getopt(1) validate and normalise the command line. Its own error text is
# silenced (--quiet); on any parse failure the script prints its own error and
# usage instead, then exits with status 1.
if ! options=$(
  getopt --quiet \
    -o i:o:s:t:l:e:d:prkcgh \
    --long input:,output:,server:,task:,lang:,initial-prompt:,dest-dir:,print,reprocess,keep-temp,cpu,gpu,help \
    -- "$@"
); then
  echo "ERROR: UNKNOWN OPTIONS INPUT!"
  helper_output
  exit 1
fi
# Parser
# Replace the positional parameters with getopt's normalised, shell-quoted
# output (eval is required to honour that quoting), then consume options one at
# a time until the "--" terminator. Options with a value shift two words, flags
# shift one.
eval set -- "$options"
while :; do
  case "$1" in
    -i | --input)          SOURCE_PATH="$2"; shift 2 ;;
    -o | --output)         TRANSCRIPT_EXTENSIONS="$2"; shift 2 ;;
    -s | --server)         WISPER_ASR_SERVER="$2"; shift 2 ;;
    -t | --task)           WISPER_ASR_TASK="$2"; shift 2 ;;
    -l | --lang)           WISPER_ASR_LANG="$2"; shift 2 ;;
    -e | --initial-prompt) WISPER_ASR_INITIAL_PROMPT="$2"; shift 2 ;;
    -d | --dest-dir)       USER_DEFINED_DESTINATION_PATH_DIRECTORY="$2"; shift 2 ;;
    -p | --print)          DESTINATION_OUTPUT_PRINT=true; shift ;;
    -r | --reprocess)      DESTINATION_REPROCESS=true; shift ;;
    -k | --keep-temp)      SOURCE_TEMP_TRANSCODING_KEEP=true; shift ;;
    # --cpu / --gpu pick one of the two preconfigured server URLs.
    -c | --cpu)            WISPER_ASR_SERVER="${WISPER_ASR_SERVER_CPU}"; shift ;;
    -g | --gpu)            WISPER_ASR_SERVER="${WISPER_ASR_SERVER_GPU}"; shift ;;
    -h | --help)
      helper_output
      exit 0
      ;;
    --)
      # End of options: whatever follows is positional.
      shift
      break
      ;;
    *)
      echo "ERROR: UNKNOWN OPTIONS CASE!"
      helper_output
      exit 1
      ;;
  esac
done
# Shift the parsed options to leave only the remaining arguments.
# NOTE(review): OPTIND is maintained by the getopts builtin, which the visible
# script does not use, so this presumably shifts by zero - confirm before
# relying on it.
shift "$(( OPTIND - 1 ))"

# Handle remaining arguments: at most one positional argument is accepted, and
# it is taken as the source path only when -i/--input has not already set one.
case $# in
  0)
    ;;
  1)
    if [ -n "${SOURCE_PATH}" ]; then
      echo "ERROR: Too many arguments. Seems like two source file paths specifited."
      helper_output
      exit 1
    fi
    SOURCE_PATH="${1}"
    ;;
  *)
    echo "ERROR: Too many unflagged arguments"
    helper_output
    exit 1
    ;;
esac
# Error Checking
# A source path is mandatory. It can arrive either as the single positional
# argument or through -i/--input, so the resolved SOURCE_PATH is tested rather
# than the positional count: with "--input PATH" no positional argument is left
# and a count-based check would wrongly reject a valid invocation.
if [[ -z "${SOURCE_PATH:-}" ]]; then
  echo "ERROR: No source path provided." >&2
  helper_output
  exit 1
fi
# Transcript formats: split the comma-separated list (txt, vtt, srt, tsv, json)
# into the TRANSCRIPT_EXTENSIONS array.
# NOTE(review): a second positional argument takes precedence here, but the
# argument handling visible earlier rejects more than one positional, so $2 is
# presumably always empty at this point.
IFS=',' read -r -a TRANSCRIPT_EXTENSIONS <<< "${2:-$TRANSCRIPT_EXTENSIONS}"

# At least one transcript extension is required.
if (( ${#TRANSCRIPT_EXTENSIONS[@]} == 0 )); then
  echo "No transcript extensions were provided."
  exit 1
fi

# The source path must exist; -e accepts any file type (regular file, directory, ...).
[[ -e "${SOURCE_PATH}" ]] || {
  echo "Source Path '${SOURCE_PATH}' does not exist"
  exit 1
}

# Fall back to the GPU instance when no server was selected on the command line.
: "${WISPER_ASR_SERVER:=${WISPER_ASR_SERVER_GPU}}"

# URL-encode the initial prompt (jq's @uri filter) for use in the query string.
WISPER_ASR_INITIAL_PROMPT_URL_ENCODED=$(jq -rn --arg x "${WISPER_ASR_INITIAL_PROMPT}" '$x|@uri')
# Print Header
# Summarises the service and the effective settings before any file is processed.
# The service's OpenAPI document is downloaded once and both the version and the
# description are read from that single copy (previously it was fetched twice).
WISPER_ASR_OPENAPI_JSON="$(curl -s "${WISPER_ASR_SERVER}/openapi.json")"
echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
echo "Whisper ASR Webservice | v$(jq -r '.info.version' <<< "${WISPER_ASR_OPENAPI_JSON}") "
echo "$(jq -r '.info.description' <<< "${WISPER_ASR_OPENAPI_JSON}")"
echo "Server: ${WISPER_ASR_SERVER}"
echo "Task: ${WISPER_ASR_TASK}"
echo "Lang: ${WISPER_ASR_LANG}"
echo "Initial Prompt: ${WISPER_ASR_INITIAL_PROMPT}"
# echo "Encoded Prompt: ${WISPER_ASR_INITIAL_PROMPT_URL_ENCODED}"
echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
echo "Source Path: ${SOURCE_PATH}"
# Only shown when --dest-dir was given; labelled like every other line of the header.
if [[ -v USER_DEFINED_DESTINATION_PATH_DIRECTORY ]]; then
  echo "Destination Directory: ${USER_DEFINED_DESTINATION_PATH_DIRECTORY}"
fi
echo "Transcript Extensions: ${TRANSCRIPT_EXTENSIONS[*]}"
echo "Destination Output Print: ${DESTINATION_OUTPUT_PRINT}"
echo "Destination Reprocess: ${DESTINATION_REPROCESS}"
echo "Source Temp Transcoding Keep: ${SOURCE_TEMP_TRANSCODING_KEEP}"
echo "============================================================================"
# Find all audio files (mp3, mp4, wav, m4a) in the source path and its
# subdirectories and produce every requested transcript for each of them.
# File names are passed NUL-delimited and read with an empty IFS so names with
# leading/trailing blanks, backslashes or newlines arrive unchanged.
# The loop is the right-hand side of a pipeline, i.e. it runs in a subshell:
# an `exit 1` below ends the loop and becomes the exit status of the pipeline.
find "${SOURCE_PATH}" -type f \( -name "*.mp3" -o -name "*.mp4" -o -name "*.wav" -o -name "*.m4a" \) -print0 |
  while IFS= read -r -d '' SOURCE_PATH_FILE; do
    # Path Variables
    # dirname (rather than stripping "/*") also gives a usable directory for a
    # path that contains no slash at all.
    SOURCE_PATH_DIRECTORY="$(dirname -- "${SOURCE_PATH_FILE}")"
    DESTINATION_PATH_DIRECTORY="${USER_DEFINED_DESTINATION_PATH_DIRECTORY:-$SOURCE_PATH_DIRECTORY}"
    SOURCE_PATH_BASENAME="$(basename -- "$SOURCE_PATH_FILE")"
    SOURCE_PATH_FILENAME="${SOURCE_PATH_BASENAME%.*}"
    SOURCE_TEMP_TRANSCODING_BASENAME="${SOURCE_PATH_FILENAME}.temp.mp3"
    SOURCE_TEMP_TRANSCODING_PATH="${DESTINATION_PATH_DIRECTORY}/${SOURCE_TEMP_TRANSCODING_BASENAME}"

    # IFS is deliberately kept at ',' for the remainder of the script (helper
    # functions invoked below inherit it). Every expansion in this loop is
    # quoted, so the loop itself does not depend on the value of IFS.
    IFS=,

    # Loop through each transcript extension
    for TRANSCRIPT_EXTENSION in "${TRANSCRIPT_EXTENSIONS[@]}"; do
      # Remove any white space from TRANSCRIPT_EXTENSION
      TRANSCRIPT_EXTENSION="${TRANSCRIPT_EXTENSION//[[:space:]]/}"

      # Set the destination path for the current transcript extension
      DESTINATION_PATH="${DESTINATION_PATH_DIRECTORY}/${SOURCE_PATH_FILENAME}.${TRANSCRIPT_EXTENSION}"

      # If destination reprocessing is true and the destination file already exists, delete it
      if [[ "${DESTINATION_REPROCESS}" == true && -e "${DESTINATION_PATH}" ]]; then
        echo "Deleting old destination file: ${DESTINATION_PATH}"
        rm -- "${DESTINATION_PATH}"
      fi

      # Then, check if a transcript file already exists for this audio file
      if [[ -f "${DESTINATION_PATH}" ]]; then
        echo "Transcript already exists: ${SOURCE_PATH_FILE} | ${TRANSCRIPT_EXTENSION}"
      else
        # Transcode m4a to mp3 so it is usable for transcribing, unless a
        # non-empty transcoded file is already present.
        # (To treat mp4 the same way, extend this test and the one further down with *.mp4.)
        if [[ "${SOURCE_PATH_FILE}" == *.m4a ]] &&
          [[ ! -f "${SOURCE_TEMP_TRANSCODING_PATH}" || ! -s "${SOURCE_TEMP_TRANSCODING_PATH}" ]]; then
          echo "Transcoding to MP3: ${SOURCE_PATH_FILE} -> ${SOURCE_TEMP_TRANSCODING_BASENAME} | ${TRANSCRIPT_EXTENSION}"
          # Convert the file to mp3 using the FFmpeg container. Source and
          # destination directories are mounted at identical paths inside it.
          # Alternative encoder settings kept for reference:
          #   MP3 # -acodec libmp3lame -ac 2 -ar 44100 -ab 192k
          #   WAV # -acodec pcm_s16le -ac 1 -ar 16000
          SECONDS=0
          if docker run --rm \
            --volume "${SOURCE_PATH_DIRECTORY}:${SOURCE_PATH_DIRECTORY}" \
            --volume "${DESTINATION_PATH_DIRECTORY}:${DESTINATION_PATH_DIRECTORY}" \
            --workdir "${SOURCE_PATH_DIRECTORY}" \
            jrottenberg/ffmpeg \
            -i "${SOURCE_PATH_BASENAME}" \
            -loglevel fatal -hide_banner -stats \
            -acodec libmp3lame -ac 1 -ar 16000 -ab 192k -y "${DESTINATION_PATH_DIRECTORY}/${SOURCE_TEMP_TRANSCODING_BASENAME}"; then
            output_task_file_info "${SECONDS}" "${SOURCE_PATH_FILE}"
          else
            echo "Failed to convert file to MP3."
            # Do not leave a broken temp file behind for the next run to pick up.
            [[ -e "${SOURCE_TEMP_TRANSCODING_PATH}" ]] && rm -- "${SOURCE_TEMP_TRANSCODING_PATH}"
            exit 1
          fi
        fi

        # Set the ASR source path to the raw file, or to the transcoded file
        if [[ "${SOURCE_PATH_FILE}" == *.m4a && -f "${SOURCE_TEMP_TRANSCODING_PATH}" ]]; then
          ASR_SOURCE_PATH="${SOURCE_TEMP_TRANSCODING_PATH}"
        else
          ASR_SOURCE_PATH="${SOURCE_PATH_FILE}"
        fi

        # Print the source and destination paths along with size and duration
        echo "Sending to ${WISPER_ASR_TASK^}: ${ASR_SOURCE_PATH} -> $(basename -- "$DESTINATION_PATH") [$(format_file_size "${ASR_SOURCE_PATH}")|$(format_audio_durration "${ASR_SOURCE_PATH}")]"

        # Send the audio file to the web service for transcription.
        # --fail makes curl return an error on an HTTP error status instead of
        # saving the server's error body as if it were a transcript.
        SECONDS=0
        if ! curl --fail --progress-bar --request 'POST' \
          "${WISPER_ASR_SERVER}/asr?task=${WISPER_ASR_TASK}&language=${WISPER_ASR_LANG}&initial_prompt=${WISPER_ASR_INITIAL_PROMPT_URL_ENCODED}&output=${TRANSCRIPT_EXTENSION}" \
          --header 'accept: application/json' \
          --header 'Content-Type: multipart/form-data' \
          --form "audio_file=@${ASR_SOURCE_PATH};type=audio/mpeg" \
          --output "${DESTINATION_PATH}"; then
          echo "Failed to transcribe file: ${DESTINATION_PATH}"
          # Remove any partial output so a later run does not mistake it for a
          # finished transcript and skip the file.
          rm -f -- "${DESTINATION_PATH}"
          exit 1
        elif [[ -f "${DESTINATION_PATH}" ]]; then
          echo "Successful Transcription: ${DESTINATION_PATH}"
          output_task_file_info "${SECONDS}" "${SOURCE_PATH_FILE}"
        else
          echo "Unknown Transcription Failure!"
        fi

        # Print output from destination file.
        if [[ "${DESTINATION_OUTPUT_PRINT}" == true ]]; then
          # Print a separator
          echo " - - - - - - - - - - - - - - - - - - -"
          # Print the contents of the destination file
          cat -- "${DESTINATION_PATH}"
        fi
      fi

      # Print a separator between formats when more than one was requested
      if [[ ${#TRANSCRIPT_EXTENSIONS[@]} -gt 1 ]]; then
        echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
      fi
    done

    # If the temporary transcoding file exists, delete it (unless --keep-temp was given).
    if [[ -e "${SOURCE_TEMP_TRANSCODING_PATH}" && "${SOURCE_TEMP_TRANSCODING_KEEP}" != true ]]; then
      echo "Deleting Temp Transconding Audio File: ${SOURCE_TEMP_TRANSCODING_PATH}"
      rm -- "${SOURCE_TEMP_TRANSCODING_PATH}"
    fi

    # Print a separator
    echo "----------------------------------------------------------------------------"
  done
@markgir
Copy link

markgir commented Mar 6, 2024

hello,
can you help me with setup. i dont know how to work with these, but know that is what i need.

@AustinSaintAubin
Copy link
Author

hello, can you help me with setup. i dont know how to work with these, but know that is what i need.

Ya, what kind of help do you need? What are you trying to do specifically?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment