Skip to content

Instantly share code, notes, and snippets.

@marc-hanheide
Last active March 12, 2025 09:33
Show Gist options
  • Save marc-hanheide/564efaccfaab1b6de1203289ea196bff to your computer and use it in GitHub Desktop.
Save marc-hanheide/564efaccfaab1b6de1203289ea196bff to your computer and use it in GitHub Desktop.
extract transcript from YouTube video
#!/bin/bash
# yt-transcript.sh - Extract transcripts from YouTube videos
#
# Author: Marc Hanheide

# Abort on the first command that exits non-zero.
set -e

# Built-in defaults; each can be overridden by a command line option.
TEMP_DIR='/tmp'      # directory that receives the intermediate files
KEEP_TEMP=0          # 0 = delete intermediate files on exit, 1 = keep them
OUTPUT_FILE=''       # empty = print the result to stdout
FORMAT='plain'       # plain, srt or json
# NOTE(review): LANG is also the standard locale variable. If it is exported
# in the calling environment, this assignment changes the locale seen by
# every child process - consider renaming it throughout the script.
LANG='en'            # subtitle language code
# Print the help text (synopsis, option list and two example invocations)
# to stdout, then terminate the script with exit status 1.
# The program name shown is the basename of $0.
usage() {
  local self
  # "|| true" keeps set -e from aborting before the help text is printed.
  self=$(basename "$0") || true
  printf '%s\n' \
    "Usage: ${self} [OPTIONS] VIDEO_URL" \
    'Extract transcript from a YouTube video.' \
    'Options:' \
    '-h, --help Show this help message and exit' \
    '-l, --language LANG Language code for transcript (default: en)' \
    '-f, --format FORMAT Output format: plain, srt, or json (default: plain)' \
    '-o, --output FILE Output file (default: print to stdout)' \
    '-k, --keep-temp Keep temporary files' \
    '-t, --temp-dir DIR Temporary directory (default: /tmp)' \
    'Example:' \
    "${self} https://www.youtube.com/watch?v=dQw4w9WgXcQ" \
    "${self} -l fr -f srt -o transcript.srt https://www.youtube.com/watch?v=dQw4w9WgXcQ"
  exit 1
}
# Remove the intermediate files, i.e. everything matching
# "${TEMP_DIR}/transcript.*". Does nothing when KEEP_TEMP is not 0.
cleanup() {
  # Guard clause: leave the files in place unless KEEP_TEMP is exactly 0.
  [ $KEEP_TEMP -eq 0 ] || return 0
  rm -f "${TEMP_DIR}"/transcript.*
}
# Parse command line arguments
#
# parse_args ARG...
#   Walks the argument list. Option values are stored in the globals LANG,
#   FORMAT, OUTPUT_FILE, KEEP_TEMP and TEMP_DIR; every word that does not
#   start with "-" is appended to the global array POSITIONAL.
#   -h/--help, an unknown option, or an option that is missing its value
#   all end in a call to usage (defined elsewhere in this script).
#   Error messages are written to stderr.
parse_args() {
  POSITIONAL=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        usage
        ;;
      -l|--language|-f|--format|-o|--output|-t|--temp-dir)
        # These options consume the following word. Without this check a
        # trailing "-o" would make "shift 2" fail and, under set -e, end
        # the script without any explanation.
        if [[ $# -lt 2 ]]; then
          echo "Error: Option $1 requires an argument" >&2
          usage
        fi
        case "$1" in
          -l|--language) LANG="$2" ;;
          -f|--format) FORMAT="$2" ;;
          -o|--output) OUTPUT_FILE="$2" ;;
          -t|--temp-dir) TEMP_DIR="$2" ;;
        esac
        shift 2
        ;;
      -k|--keep-temp)
        KEEP_TEMP=1
        shift
        ;;
      -*)
        echo "Error: Unknown option $1" >&2
        usage
        ;;
      *)
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
}
parse_args "$@"
# Check if a video URL was provided
if [ ${#POSITIONAL[@]} -eq 0 ]; then
  echo "Error: No YouTube URL provided" >&2
  usage
fi
# All downloading is delegated to yt-dlp, so fail early with a clear message
# rather than with a bare "command not found" further down.
if ! command -v yt-dlp > /dev/null 2>&1; then
  echo "Error: yt-dlp is required but was not found in PATH (https://github.com/yt-dlp/yt-dlp)" >&2
  exit 1
fi
# Set the video URL (only the first positional argument is used)
VIDEO_URL=${POSITIONAL[0]}
TEMP_BASE="${TEMP_DIR}/transcript"
# Register cleanup on exit
trap cleanup EXIT
# Drop a subtitle file left behind by an earlier --keep-temp run so that the
# existence check below reflects this download and not old data.
rm -f -- "${TEMP_BASE}.${LANG}.srt"
# Fetch the video description into its own file
yt-dlp --get-description --skip-download "$VIDEO_URL" > "${TEMP_BASE}.description"
# Download subtitles (manual or automatic) and convert them to SRT.
# yt-dlp's own output is sent to stderr so stdout carries only the result.
yt-dlp --skip-download --write-subs --write-auto-subs \
  --sub-lang "$LANG" --sub-format ttml --convert-subs srt \
  --output "${TEMP_BASE}.%(ext)s" "$VIDEO_URL" 1>&2
# The format step reads ${TEMP_BASE}.${LANG}.srt. Stop here with an explicit
# error if the download did not produce it, instead of emitting a result
# that is silently missing the transcript.
if [ ! -s "${TEMP_BASE}.${LANG}.srt" ]; then
  echo "Error: No '${LANG}' subtitles were downloaded for ${VIDEO_URL}" >&2
  exit 1
fi
# Escape text for embedding inside a JSON string literal.
# Reads stdin and writes stdout. Control characters other than tab and
# newline are removed (this includes carriage returns), tabs become spaces,
# and backslashes and double quotes are backslash-escaped. Newlines are
# passed through untouched; the caller decides what to do with them.
json_escape_lines() {
  tr -d '\000-\010\013-\037' |
    tr '\t' ' ' |
    sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Print one JSON object per cue of the SRT file $1, separated by ",\n",
# with no trailing newline. Markup tags (<...>) are removed, and all
# caption lines of a cue are joined with single spaces.
srt_to_json_items() {
  json_escape_lines < "$1" |
    awk 'BEGIN { RS = ""; FS = "\n" }
    {
      # Paragraph mode: each blank-line separated cue is one record.
      # $1 is the cue number, $2 the time range, $3..$NF the caption lines.
      text = ""
      for (i = 3; i <= NF; i++) {
        line = $i
        gsub(/<[^>]*>/, "", line)
        text = (i == 3) ? line : text " " line
      }
      if (NR > 1) printf ",\n"
      printf " {\"time\": \"%s\", \"text\": \"%s\"}", $2, text
    }'
}

# Print the caption text of the SRT file $1 as a single line: blank lines,
# lines consisting only of digits, and time-range lines are dropped, markup
# tags are removed, and the remaining lines are joined with spaces.
srt_to_plain() {
  sed '/^$/d' "$1" |
    grep -v '^[0-9]*$' |
    grep -v -- '-->' |
    sed 's/<[^>]*>//g' |
    tr '\n' ' '
}

# Process according to requested format
case $FORMAT in
  srt)
    # Just use the SRT file as is
    RESULT_CONTENT=$(cat "${TEMP_BASE}.${LANG}.srt")
    ;;
  json)
    # Convert to a basic JSON format: the description as one string (line
    # breaks flattened to spaces) followed by an array of cues.
    RESULT_CONTENT=$(
      description=$(json_escape_lines < "${TEMP_BASE}.description" | tr '\n' ' ')
      printf '%s\n' "{"
      printf ' "description": "%s",\n' "$description"
      printf '%s\n' ' "transcript": ['
      srt_to_json_items "${TEMP_BASE}.${LANG}.srt"
      printf '\n ]\n'
      printf '%s\n' "}"
    )
    ;;
  plain|*)
    # Extract plain text; any unrecognised format also ends up here.
    RESULT_CONTENT=$(
      # First get the description
      cat "${TEMP_BASE}.description"
      printf '\n\n\n'
      # Then get the transcript
      srt_to_plain "${TEMP_BASE}.${LANG}.srt"
    )
    ;;
esac
# Output result
#
# write_result
#   Writes RESULT_CONTENT followed by a newline to OUTPUT_FILE when that
#   variable is non-empty (and reports the file name on stderr), otherwise
#   to stdout. printf is used instead of echo so that content such as "-n"
#   or "-e" is printed literally rather than being taken as an echo option.
write_result() {
  if [ -n "$OUTPUT_FILE" ]; then
    printf '%s\n' "$RESULT_CONTENT" > "$OUTPUT_FILE"
    echo "Transcript saved to $OUTPUT_FILE" >&2
  else
    printf '%s\n' "$RESULT_CONTENT"
  fi
}
write_result
@marc-hanheide
Copy link
Author

Requires yt-dlp (https://github.com/yt-dlp/yt-dlp) to be installed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment