Last active
March 12, 2025 09:33
-
-
Save marc-hanheide/564efaccfaab1b6de1203289ea196bff to your computer and use it in GitHub Desktop.
Extract the transcript from a YouTube video
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# yt-transcript.sh - Extract transcripts from YouTube videos
#
# Downloads the description and the subtitles of a video with yt-dlp and
# prints them as plain text, SRT, or a simple JSON document.
#
# Requires: yt-dlp (https://github.com/yt-dlp/yt-dlp)
#
# Author: Marc Hanheide

# Abort as soon as a command fails.
set -e

# Default values; each one can be overridden by a command line option.
# NOTE(review): LANG is also the POSIX locale variable. When it is exported
# in the environment, assigning it here changes the locale seen by every
# child process. Consider renaming it (e.g. SUB_LANG) together with every
# use further down in this script.
LANG="en"         # subtitle language code (-l)
FORMAT="plain"    # output format: plain, srt or json (-f)
OUTPUT_FILE=""    # output file; empty means print to stdout (-o)
KEEP_TEMP=0       # 1 = keep the temporary files (-k)
TEMP_DIR="/tmp"   # directory for the temporary files (-t)
# Function to display usage information
#
# Arguments: $1 - exit status (optional, default: 1). Pass 0 when the help
#                 text was explicitly requested.
# Outputs:   the help text on stdout
# Returns:   never; always exits the shell with the given status
usage() {
  local prog=${0##*/}
  cat << EOF
Usage: ${prog} [OPTIONS] VIDEO_URL

Extract transcript from a YouTube video.

Options:
  -h, --help            Show this help message and exit
  -l, --language LANG   Language code for transcript (default: en)
  -f, --format FORMAT   Output format: plain, srt, or json (default: plain)
  -o, --output FILE     Output file (default: print to stdout)
  -k, --keep-temp       Keep temporary files
  -t, --temp-dir DIR    Temporary directory (default: /tmp)

Example:
  ${prog} https://www.youtube.com/watch?v=dQw4w9WgXcQ
  ${prog} -l fr -f srt -o transcript.srt https://www.youtube.com/watch?v=dQw4w9WgXcQ
EOF
  exit "${1:-1}"
}
# Function to cleanup temporary files
#
# Removes every "${TEMP_DIR}/transcript.*" file unless KEEP_TEMP is non-zero.
# Globals: KEEP_TEMP (read), TEMP_DIR (read)
cleanup() {
  # An unset or empty KEEP_TEMP counts as 0 instead of breaking the test.
  if [[ "${KEEP_TEMP:-0}" -eq 0 ]]; then
    # "--" keeps a TEMP_DIR that starts with "-" from being read as options.
    rm -f -- "${TEMP_DIR}/transcript".*
  fi
}
# Parse command line arguments

# Abort with a usage error unless the option in $1 is followed by a value.
# Call it as: require_value "$@"
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: Option $1 requires an argument" >&2
    usage
  fi
}

# parse_args ARGS...
#   Sets LANG, FORMAT, OUTPUT_FILE, KEEP_TEMP and TEMP_DIR from the options
#   and collects every non-option argument in the POSITIONAL array.
#   Calls usage (which exits) for -h/--help, for an unknown option and for
#   an option that is missing its value. A literal "--" ends option
#   parsing; everything after it is taken as a positional argument.
parse_args() {
  POSITIONAL=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        # Help was requested explicitly, so ask for a zero exit status.
        usage 0
        ;;
      -l|--language)
        require_value "$@"
        LANG="$2"
        shift 2
        ;;
      -f|--format)
        require_value "$@"
        FORMAT="$2"
        shift 2
        ;;
      -o|--output)
        require_value "$@"
        OUTPUT_FILE="$2"
        shift 2
        ;;
      -k|--keep-temp)
        KEEP_TEMP=1
        shift
        ;;
      -t|--temp-dir)
        require_value "$@"
        TEMP_DIR="$2"
        shift 2
        ;;
      --)
        shift
        POSITIONAL+=("$@")
        break
        ;;
      -*)
        echo "Error: Unknown option $1" >&2
        usage
        ;;
      *)
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Check if a video URL was provided
if [[ ${#POSITIONAL[@]} -eq 0 ]]; then
  echo "Error: No YouTube URL provided" >&2
  usage
fi

# Set the video URL (only the first positional argument is used)
VIDEO_URL="${POSITIONAL[0]}"

# Work in a private, unpredictably named directory below the requested
# temporary directory. Fixed names such as /tmp/transcript.description
# collide between concurrent runs and can be pre-created by other users.
WORK_DIR=$(mktemp -d "${TEMP_DIR%/}/yt-transcript.XXXXXX") || {
  echo "Error: Cannot create a temporary directory in ${TEMP_DIR}" >&2
  exit 1
}
TEMP_DIR="$WORK_DIR"
TEMP_BASE="${TEMP_DIR}/transcript"

# Exit handler: run cleanup, then remove the private directory. When the
# temporary files are kept (-k), report where they are instead.
finish() {
  cleanup
  if [[ "${KEEP_TEMP:-0}" -eq 0 ]]; then
    # Best effort: the directory is left alone when it is not empty.
    rmdir -- "$TEMP_DIR" 2>/dev/null || true
  else
    echo "Temporary files kept in $TEMP_DIR" >&2
  fi
}

# Register cleanup on exit
trap finish EXIT
# Fail early with a clear message when yt-dlp is not installed
if ! command -v yt-dlp > /dev/null 2>&1; then
  echo "Error: yt-dlp not found in PATH (see https://github.com/yt-dlp/yt-dlp)" >&2
  exit 1
fi

# Download the video description
yt-dlp --get-description --skip-download "$VIDEO_URL" > "${TEMP_BASE}.description"

# Download subtitles (manual ones and auto-generated ones) and convert them
# to SRT. yt-dlp's own output goes to stderr so that stdout carries only
# the transcript.
yt-dlp --skip-download --write-subs --write-auto-subs \
  --sub-lang "$LANG" --sub-format ttml --convert-subs srt \
  --output "${TEMP_BASE}.%(ext)s" "$VIDEO_URL" 1>&2

# The rest of the script reads exactly this file. Stop here when yt-dlp did
# not produce it, e.g. because the video has no subtitles in that language.
if [[ ! -f "${TEMP_BASE}.${LANG}.srt" ]]; then
  echo "Error: No '${LANG}' subtitles found for ${VIDEO_URL}" >&2
  exit 1
fi
# Process according to requested format

# Escape stdin for use inside a JSON string: carriage returns are dropped,
# tabs become spaces, backslashes and double quotes get a backslash prefix.
# Newlines are passed through unchanged.
json_escape() {
  tr -d '\r' | tr '\t' ' ' | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Print the cues of the SRT file $1 as comma separated JSON objects with a
# "time" and a "text" member. All text lines of a cue are joined with
# single spaces and markup tags such as <i> are removed.
srt_to_json_entries() {
  json_escape < "$1" |
    awk 'BEGIN { RS = ""; FS = "\n" }
      {
        # Field 1 is the cue number, field 2 the time range, the rest text.
        text = $3
        for (i = 4; i <= NF; i++) text = text " " $i
        gsub(/<[^>]*>/, "", text)
        if (NR > 1) printf ",\n"
        printf "    {\"time\": \"%s\", \"text\": \"%s\"}", $2, text
      }'
}

# Print the spoken text of the SRT file $1 on a single line. Each text line
# is followed by one space; cue numbers, time ranges, blank lines and
# markup tags are left out.
srt_to_plain() {
  awk '
    BEGIN { expect_index = 1 }
    { sub(/\r$/, "") }
    /^$/ { expect_index = 1; next }
    # A number counts as a cue number only at the start of a cue, so a
    # subtitle line such as "42" is kept.
    expect_index && /^[0-9]+$/ { expect_index = 0; next }
    { expect_index = 0 }
    /^[0-9]+:[0-9][0-9]:[0-9][0-9][,.][0-9]+ +-->/ { next }
    {
      gsub(/<[^>]*>/, "")
      if ($0 != "") printf "%s ", $0
    }
  ' "$1"
}

case "$FORMAT" in
  srt)
    # Just use the SRT file as is
    RESULT_CONTENT=$(cat "${TEMP_BASE}.${LANG}.srt")
    ;;
  json)
    # Convert to a basic JSON format
    RESULT_CONTENT=$(
      # The description becomes one line; its newlines turn into spaces.
      DESCRIPTION=$(json_escape < "${TEMP_BASE}.description" | tr '\n' ' ')
      printf '{\n'
      printf '  "description": "%s",\n' "$DESCRIPTION"
      printf '  "transcript": [\n'
      srt_to_json_entries "${TEMP_BASE}.${LANG}.srt"
      printf '\n  ]\n'
      printf '}\n'
    )
    ;;
  plain|*)
    # Unknown formats still fall back to plain text, but say so.
    [[ "$FORMAT" == "plain" ]] ||
      echo "Warning: Unknown format '${FORMAT}', using plain" >&2
    # Extract plain text
    RESULT_CONTENT=$(
      # First get the description
      cat "${TEMP_BASE}.description"
      printf '\n\n\n'
      # Then get the transcript
      srt_to_plain "${TEMP_BASE}.${LANG}.srt"
    )
    ;;
esac
# Output result
#
# Writes RESULT_CONTENT followed by a newline to OUTPUT_FILE when one is
# set (and reports that on stderr), otherwise to stdout. printf is used
# instead of echo so that content such as "-n" or "-e" is printed verbatim.
# Globals: RESULT_CONTENT (read), OUTPUT_FILE (read)
write_result() {
  if [[ -n "$OUTPUT_FILE" ]]; then
    printf '%s\n' "$RESULT_CONTENT" > "$OUTPUT_FILE"
    echo "Transcript saved to $OUTPUT_FILE" >&2
  else
    printf '%s\n' "$RESULT_CONTENT"
  fi
}

write_result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requires yt-dlp (https://github.com/yt-dlp/yt-dlp) to be installed.