Last active
November 10, 2024 19:27
-
-
Save jnorthrup/1a6b62bc8b696e3c818b050f8c3872af to your computer and use it in GitHub Desktop.
summarize2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# CHARTER: This script is designed to process audio/video content, generate transcripts,
# summaries, and audio files. The following tasks are defined and must not be altered by any LLM:
# 1. Download content if a URL is provided
# 2. Generate or use existing VTT transcript
# 3. Clean the VTT transcript
# 4. Generate a summary of the transcript
# 5. Create a 16k VBR Opus OGG file for audio tracks (unless audio download is disabled)
# 6. Output both the cleaned VTT text and the summary
# 7. Exclude the WAV file from the results
# 8. Include the OGG file in the results only if both WAV and OGG were created
# This charter is unalterable and defines the core functionality of the script.

# Configuration (adjust these paths)
WHISPCC="$HOME/work/whisper.cpp" # ./main to run ; ./models for models
MODEL_PATH="$WHISPCC/models/ggml-small.en-tdrz.bin"
OUTPUT_DIR="$HOME/processed_audio"
CACHE_DIR="/tmp/summarize_cache"
# Model passed to `ollama run`. "llama3.1:latest" used to be assigned on the
# line before this one and was immediately overwritten, so only this value was
# ever in effect; swap it in here to use that model instead.
OLLAMA_MODEL="deepseek-coder-v2:16b"

# Prompts for different segments
FIRST_PROMPT="Summarize this beginning part of a transcript in one sentence, then provide bullet points with timestamps (00:00:00 sentence)."
MIDDLE_PROMPT="Summarize the key points of this part of the transcript in bullet points with timestamps (00:00:00 sentence)."
LAST_PROMPT="Summarize the main takeaways of this final part of the transcript in bullet points with timestamps (00:00:00 sentence)."

# Global variable to track job queue (command strings run by process_job_queue)
JOB_QUEUE=()

# Ensure output and cache directories exist
mkdir -p "$OUTPUT_DIR" "$CACHE_DIR"
# Parse command line options
#   -f           summarize with fabric instead of ollama
#   -n           disable audio processing
#   -a           enable audio processing (default)
#   -d DURATION  value passed to ffmpeg -t when converting a local file to WAV
USE_FABRIC=false
DISABLE_AUDIO=false
DURATION=""

# Parses the option flags in "$@" into USE_FABRIC, DISABLE_AUDIO and DURATION.
# Leaves OPTIND pointing at the first non-option argument so the caller can
# shift. Exits with status 1 on an unknown option or a missing option argument.
parse_options() {
  local opt
  OPTIND=1
  # The leading ':' selects getopts' silent mode. Without it getopts leaves
  # OPTARG unset for bad options, so the error message could not name the
  # offending flag, and a missing argument could not be told apart.
  while getopts ":fnad:" opt; do
    case "$opt" in
      f)
        USE_FABRIC=true
        ;;
      n)
        DISABLE_AUDIO=true
        ;;
      a)
        DISABLE_AUDIO=false
        ;;
      d)
        DURATION="$OPTARG"
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        exit 1
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND-1))
# Function to get MD5 hash of a file
# Arguments: $1 - path of the file to hash
# Outputs:   the hexadecimal MD5 digest on stdout
# Returns:   0 on success, 1 if the file cannot be read or hashing fails
get_md5() {
  local FILE_PATH="$1"
  local SUM

  if [[ ! -r "$FILE_PATH" ]]; then
    echo "Error: Cannot read file for hashing: $FILE_PATH" >&2
    return 1
  fi

  # Feed the file on stdin instead of passing its name: a name starting with
  # '-' is then never parsed as an option, and md5sum never switches to its
  # escaped output format for names containing backslashes or newlines.
  SUM=$(md5sum < "$FILE_PATH") || return 1

  # md5sum prints "<digest>  -"; keep only the digest.
  printf '%s\n' "${SUM%% *}"
}
# Function to cache a file using hardlinks (atomic)
# The cache entry lives at
#   $CACHE_DIR/<md5[0:2]>/<md5[2:4]>/<md5>_<sanitized input path><extension>
# Globals:   CACHE_DIR (read)
# Arguments: $1 - file to cache (must exist and be non-empty)
#            $2 - extension appended to the cache file name
# Outputs:   cache file path on stdout; progress and errors on stderr
# Returns:   0 on success, 1 on any failure
cache_file() {
  local INPUT_FILE="$1"
  local EXTENSION="$2"
  local MD5 CACHE_SUBDIR SAFE_FILENAME CACHE_FILE TEMP_COPY

  # Check if the input file exists and is not empty
  if [ ! -s "$INPUT_FILE" ]; then
    echo "Error: Input file is empty or does not exist." >&2
    return 1
  fi

  # Assigned separately from 'local' so a hashing failure is not masked.
  MD5=$(get_md5 "$INPUT_FILE")
  if [ -z "$MD5" ]; then
    echo "Error: Failed to compute MD5 sum." >&2
    return 1
  fi

  CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
  SAFE_FILENAME="${INPUT_FILE//[^a-zA-Z0-9._-]/_}"
  CACHE_FILE="$CACHE_SUBDIR/${MD5}_${SAFE_FILENAME}${EXTENSION}"
  echo "Cache operation: MD5 sum = $MD5" >&2
  echo "Cache file: $CACHE_FILE" >&2

  # Create cache subdirectory if it doesn't exist
  if ! mkdir -p "$CACHE_SUBDIR"; then
    echo "Error: Failed to create cache subdirectory." >&2
    return 1
  fi

  # Attempt to create the hardlink
  if ln -f -- "$INPUT_FILE" "$CACHE_FILE" 2>/dev/null; then
    echo "Cache file created: $CACHE_FILE" >&2
    echo "$CACHE_FILE"
    return 0
  fi

  # Hard links cannot cross filesystems (e.g. input under $HOME, cache under
  # /tmp). Fall back to a copy that is renamed into place so readers never see
  # a partially written cache entry.
  TEMP_COPY="${CACHE_FILE}.tmp.$$"
  if cp -- "$INPUT_FILE" "$TEMP_COPY" && mv -f -- "$TEMP_COPY" "$CACHE_FILE"; then
    echo "Cache file created: $CACHE_FILE" >&2
    echo "$CACHE_FILE"
    return 0
  fi

  rm -f -- "$TEMP_COPY"
  echo "Error: Failed to create cache file." >&2
  return 1
}
# Function to sanitize a string for use as a filename
# Transliterates to ASCII where iconv can, replaces every character outside
# [A-Za-z0-9._-] with '_', and lowercases the result.
# Arguments: $1 - arbitrary string (e.g. a video title)
# Outputs:   the sanitized name on stdout; "untitled" if nothing usable remains
sanitize_filename() {
  local STRING="$1"
  local ASCII CLEANED

  # printf instead of echo: echo would swallow inputs such as "-n" or "-e".
  ASCII=$(printf '%s' "$STRING" | iconv -c -t ascii//translit 2>/dev/null)

  # iconv missing or unable to convert: fall back to the raw string; the sed
  # step below still replaces every non-ASCII byte.
  if [ -z "$ASCII" ]; then
    ASCII="$STRING"
  fi

  # LC_ALL=C makes the bracket ranges strictly ASCII regardless of locale.
  CLEANED=$(printf '%s' "$ASCII" \
    | LC_ALL=C sed 's/[^A-Za-z0-9._-]/_/g' \
    | LC_ALL=C tr '[:upper:]' '[:lower:]')

  # Never hand callers an empty base name (it would yield files like ".vtt").
  printf '%s\n' "${CLEANED:-untitled}"
}
# Function to clean text from a VTT file
# Reads stdin and writes stdout: removes <...> tags, squeezes runs of spaces
# into one space, and trims leading/trailing blanks from every line.
clean_text() {
  # [[:blank:]] (space or tab) is used instead of "[ \t]": inside a bracket
  # expression "\t" is only a tab for GNU sed; POSIX/BSD sed reads it as the
  # two literal characters '\' and 't' and would eat leading/trailing t's.
  sed 's/<[^>]*>//g' \
    | tr -s ' ' \
    | sed -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//'
}
# Function to summarize a segment of text
# Globals:   CACHE_DIR, USE_FABRIC, OLLAMA_MODEL (read)
# Arguments: $1 - segment text
#            $2 - prompt handed to ollama (not used on the fabric path)
# Outputs:   the summary on stdout. For inputs shorter than 12 lines, a notice
#            followed by the unmodified text.
# Returns:   0 on success, 1 if the summarizer command fails
summarize_segment() {
  local SEGMENT_TEXT="$1"
  local PROMPT="$2"
  local SUMMARY_OUTPUT=""
  local LINE_COUNT MD5 CACHE_SUBDIR
  local STATUS=0

  # Count the number of lines in the input
  LINE_COUNT=$(printf '%s\n' "$SEGMENT_TEXT" | wc -l)

  # If the input has less than 12 lines, remove cache and return a simple response
  if [ "$LINE_COUNT" -lt 12 ]; then
    MD5=$(printf '%s\n' "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
    # Only touch the cache when a digest was produced. With an empty MD5 the
    # glob below would degrade to "$CACHE_DIR///*" and delete unrelated files.
    if [ -n "$MD5" ]; then
      CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
      rm -f -- "$CACHE_SUBDIR/$MD5"*
    fi
    echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
    printf '%s\n' "$SEGMENT_TEXT"
    return 0
  fi

  # The exit status is captured right at the command instead of reading $?
  # after the whole if/else. stderr is folded into the captured text so it can
  # be shown in the error message below.
  if [ "$USE_FABRIC" = true ]; then
    SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1) || STATUS=$?
  else
    # Use ollama for summarization
    SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1) || STATUS=$?
  fi

  if [ "$STATUS" -ne 0 ]; then
    echo "Error in summarization: $SUMMARY_OUTPUT" >&2
    return 1
  fi

  printf '%s\n' "$SUMMARY_OUTPUT"
}
# Function to add a job to the queue
# Globals:   JOB_QUEUE (appended to)
# Arguments: one or more command strings; each argument becomes its own queue
#            entry. process_job_queue later runs every entry through eval, so
#            callers must shell-quote any data embedded in the string.
add_job() {
  JOB_QUEUE+=("$@")
}
# Function to update the progress bar for a job
# Globals:   JOB_COUNT (read if set), JOB_QUEUE (read as fallback)
# Arguments: $1 - zero-based job index
#            $2 - total number of steps in the job
#            $3 - current step
#            $4 - message describing the current step
# Outputs:   one progress line on stdout
update_job_progress() {
  local JOB_INDEX="$1"
  local TOTAL_STEPS="$2"
  local CURRENT_STEP="$3"
  local JOB_MESSAGE="$4"
  # JOB_COUNT is a local of process_job_queue and is only visible when this
  # function is called from inside it; fall back to the queue length so the
  # output is still meaningful when called from anywhere else.
  local TOTAL_JOBS="${JOB_COUNT:-${#JOB_QUEUE[@]}}"

  # ... (Implementation for updating the TUI progress bar)
  # You can use a library like 'whiptail' or 'dialog' for TUI elements
  # Example using echo for now:
  echo "Job $((JOB_INDEX+1))/$TOTAL_JOBS: $JOB_MESSAGE ($CURRENT_STEP/$TOTAL_STEPS)"
}
# Function to process the job queue
# Runs every entry of JOB_QUEUE in order.
# Globals:   JOB_QUEUE (read)
# Outputs:   a header line on stdout plus whatever the jobs print; one error
#            line on stderr for each job that returns non-zero
# Returns:   0 if every job succeeded, 1 if at least one job failed
process_job_queue() {
  local JOB_COUNT=${#JOB_QUEUE[@]}
  local i STATUS
  local FAILED=0

  echo "Processing job queue ($JOB_COUNT jobs)..."
  for (( i = 0; i < JOB_COUNT; i++ )); do
    # NOTE(review): entries are command strings and are run with eval, so they
    # must only ever be built from shell-quoted data (e.g. printf '%q').
    eval "${JOB_QUEUE[$i]}"
    STATUS=$?
    if [ "$STATUS" -ne 0 ]; then
      echo "Error: Job $((i+1))/$JOB_COUNT failed with status $STATUS: ${JOB_QUEUE[$i]}" >&2
      FAILED=1
    fi
  done

  return "$FAILED"
}
# Function to process a single segment
# Same summarization as summarize_segment, but the result is written to a file.
# Globals:   CACHE_DIR, USE_FABRIC, OLLAMA_MODEL (read)
# Arguments: $1 - segment text
#            $2 - prompt handed to ollama (not used on the fabric path)
#            $3 - file that receives the summary (or, for inputs shorter than
#                 12 lines, the unmodified text)
# Outputs:   a notice on stdout for short inputs; errors on stderr
# Returns:   0 on success, 1 if summarization or writing the file fails
process_segment() {
  local SEGMENT_TEXT="$1"
  local PROMPT="$2"
  local OUTPUT_FILE="$3"
  local SUMMARY_OUTPUT=""
  local LINE_COUNT MD5 CACHE_SUBDIR
  local STATUS=0

  if [ -z "$OUTPUT_FILE" ]; then
    echo "Error: No output file given for segment summary." >&2
    return 1
  fi

  # Count the number of lines in the input
  LINE_COUNT=$(printf '%s\n' "$SEGMENT_TEXT" | wc -l)

  # If the input has less than 12 lines, remove cache and return a simple response
  if [ "$LINE_COUNT" -lt 12 ]; then
    MD5=$(printf '%s\n' "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
    # Only touch the cache when a digest was produced. With an empty MD5 the
    # glob below would degrade to "$CACHE_DIR///*" and delete unrelated files.
    if [ -n "$MD5" ]; then
      CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
      rm -f -- "$CACHE_SUBDIR/$MD5"*
    fi
    echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
    printf '%s\n' "$SEGMENT_TEXT" > "$OUTPUT_FILE" || return 1
    return 0
  fi

  # The exit status is captured right at the command instead of reading $?
  # after the whole if/else.
  if [ "$USE_FABRIC" = true ]; then
    SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1) || STATUS=$?
  else
    # Use ollama for summarization
    SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1) || STATUS=$?
  fi

  if [ "$STATUS" -ne 0 ]; then
    echo "Error in summarization: $SUMMARY_OUTPUT" >&2
    return 1
  fi

  # Write the summary to the specified output file
  printf '%s\n' "$SUMMARY_OUTPUT" > "$OUTPUT_FILE" || return 1
}
# Helper for process_vtt: writes a header line and the summary of one
# transcript segment to stdout. Empty segments are skipped entirely.
# Globals:   USE_FABRIC, OLLAMA_MODEL (read)
# Arguments: $1 - segment label ("first", "middle" or "last")
#            $2 - prompt handed to ollama (not used on the fabric path)
#            $3 - segment text
# Returns:   the exit status of the summarizer command (0 for an empty segment)
_process_vtt_summarize_segment() {
  local LABEL="$1"
  local PROMPT="$2"
  local TEXT="$3"

  [ -n "$TEXT" ] || return 0

  echo "Generating summary for $LABEL segment..."
  if [ "$USE_FABRIC" = true ]; then
    fabric -p summarize "$TEXT"
  else
    ollama run "$OLLAMA_MODEL" "$PROMPT" "$TEXT"
  fi
}

# Function to process a VTT file (generate summary and handle versioning)
# Cleans the transcript with vttclean.py (expected next to this script), splits
# it into three consecutive segments, summarizes each one and writes the result
# to "$OUTPUT_DIR/<vtt basename>_summary.txt", which is then printed.
# Globals:   OUTPUT_DIR, USE_FABRIC, OLLAMA_MODEL, FIRST_PROMPT, MIDDLE_PROMPT,
#            LAST_PROMPT (read)
# Arguments: $1 - path to the VTT file
#            $2 - source URL or path; accepted for callers but not used here
# Exits the script with status 1 on any failure.
process_vtt() {
  local VTT_FILE=$1
  local TEMP_DIR BASE_NAME CLEANED_TRANSCRIPT SUMMARY_FILE
  local TOTAL_LINES SEGMENT_SIZE
  local FIRST_SEGMENT="" MIDDLE_SEGMENT="" LAST_SEGMENT=""
  local FAILED=0

  if ! TEMP_DIR=$(mktemp -d); then
    echo "Error: Failed to create a temporary directory." >&2
    exit 1
  fi
  BASE_NAME="${TEMP_DIR}/temp" # Temporary base name
  CLEANED_TRANSCRIPT="${BASE_NAME}_cleaned.txt"
  SUMMARY_FILE="${OUTPUT_DIR}/$(basename "$VTT_FILE" .vtt)_summary.txt"

  echo "Processing VTT file: $VTT_FILE"

  # Clean the VTT transcript
  if ! python3 "$(dirname "$0")/vttclean.py" "$VTT_FILE" > "$CLEANED_TRANSCRIPT" 2>"${CLEANED_TRANSCRIPT}.error"; then
    echo "Error: Failed to clean the VTT file. Error log:" >&2
    cat "${CLEANED_TRANSCRIPT}.error" >&2
    rm -rf -- "$TEMP_DIR"
    exit 1
  fi

  # Check if the cleaned transcript is empty
  if [ ! -s "$CLEANED_TRANSCRIPT" ]; then
    echo "Error: Cleaned transcript is empty." >&2
    rm -rf -- "$TEMP_DIR"
    exit 1
  fi

  # Generate summary
  echo "Summarizing transcript..."
  TOTAL_LINES=$(wc -l < "$CLEANED_TRANSCRIPT")
  SEGMENT_SIZE=$((TOTAL_LINES / 3))

  if [ "$SEGMENT_SIZE" -eq 0 ]; then
    # Fewer than three lines: summarize the whole transcript as one segment
    # rather than calling the model on empty text.
    FIRST_SEGMENT=$(< "$CLEANED_TRANSCRIPT")
  else
    FIRST_SEGMENT=$(head -n "$SEGMENT_SIZE" "$CLEANED_TRANSCRIPT")
    MIDDLE_SEGMENT=$(sed -n "$((SEGMENT_SIZE + 1)),$((2 * SEGMENT_SIZE))p" "$CLEANED_TRANSCRIPT")
    # The last segment runs to the end of the file so the remainder of the
    # integer division above is not dropped (e.g. line 7 of 10).
    LAST_SEGMENT=$(tail -n "+$((2 * SEGMENT_SIZE + 1))" "$CLEANED_TRANSCRIPT")
  fi

  # The header lines are part of the summary file, so "file is non-empty"
  # cannot detect a failed summarizer; track the command status instead.
  {
    _process_vtt_summarize_segment "first" "$FIRST_PROMPT" "$FIRST_SEGMENT" || FAILED=1
    _process_vtt_summarize_segment "middle" "$MIDDLE_PROMPT" "$MIDDLE_SEGMENT" || FAILED=1
    _process_vtt_summarize_segment "last" "$LAST_PROMPT" "$LAST_SEGMENT" || FAILED=1
  } > "$SUMMARY_FILE"

  if [ "$FAILED" -ne 0 ] || [ ! -s "$SUMMARY_FILE" ]; then
    echo "Error: Summary generation failed." >&2
    rm -rf -- "$TEMP_DIR"
    exit 1
  fi

  echo "Summarization complete."

  # Display the content of the summary file
  echo "Summary content:"
  echo "----------------------------------------"
  cat "$SUMMARY_FILE"
  echo "----------------------------------------"

  # Clean up
  rm -rf -- "$TEMP_DIR"
}
# Function to calculate the time difference between two timestamps in HH:MM:SS format
# A fractional part on the seconds field (HH:MM:SS.mmm) is ignored.
# Arguments: $1 - first timestamp
#            $2 - second timestamp
# Outputs:   $1 minus $2 in whole seconds (negative if $2 is later than $1)
time_difference() {
  local TIME1="$1" # Format: HH:MM:SS
  local TIME2="$2" # Format: HH:MM:SS
  local TIME1_HOUR TIME1_MINUTE TIME1_SECOND
  local TIME2_HOUR TIME2_MINUTE TIME2_SECOND

  # Extract hours, minutes, and seconds from timestamps
  IFS=: read -r TIME1_HOUR TIME1_MINUTE TIME1_SECOND <<< "$TIME1"
  IFS=: read -r TIME2_HOUR TIME2_MINUTE TIME2_SECOND <<< "$TIME2"

  # Drop any fractional seconds; shell arithmetic is integer-only.
  TIME1_SECOND="${TIME1_SECOND%%.*}"
  TIME2_SECOND="${TIME2_SECOND%%.*}"

  # Calculate total seconds for each timestamp.
  # The 10# prefix forces base 10: zero-padded fields such as "08" and "09"
  # would otherwise be read as (invalid) octal numbers and abort the expression.
  local TIME1_TOTAL_SECONDS=$(( 10#${TIME1_HOUR:-0} * 3600 + 10#${TIME1_MINUTE:-0} * 60 + 10#${TIME1_SECOND:-0} ))
  local TIME2_TOTAL_SECONDS=$(( 10#${TIME2_HOUR:-0} * 3600 + 10#${TIME2_MINUTE:-0} * 60 + 10#${TIME2_SECOND:-0} ))

  # Calculate the difference in seconds
  local DIFF_SECONDS=$((TIME1_TOTAL_SECONDS - TIME2_TOTAL_SECONDS))

  # Return the difference (could be negative if TIME2 is later than TIME1)
  echo "$DIFF_SECONDS"
}
# Main script logic
# Accepts exactly one input ($1) and dispatches on its form:
#   *.vtt            - summarize the existing transcript
#   http(s)://...    - fetch subtitles with yt-dlp, or download the audio and
#                      transcribe it with whisper.cpp when there are none
#   existing file    - convert to 16 kHz mono WAV if needed, then transcribe
# Transcripts are queued as process_vtt jobs and run by process_job_queue.

# Queue a process_vtt job for transcript $1 with source $2.
# Both values are shell-quoted with %q because process_job_queue runs queue
# entries through eval; unquoted paths or URLs containing quotes, '$' or
# backticks would otherwise be executed as shell code.
queue_vtt_job() {
  local JOB
  printf -v JOB '%q ' process_vtt "$1" "$2"
  add_job "$JOB"
}

# Run whisper.cpp on WAV file $1; on failure print message $2 and exit.
# On success the global VTT_FILE holds the transcript path (<name>.vtt).
transcribe_wav() {
  local WAV="$1"
  local ERROR_MESSAGE="$2"

  echo "Running Whisper-CPP to generate VTT transcript..."
  if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV"; then
    echo "$ERROR_MESSAGE" >&2
    exit 1
  fi

  VTT_FILE="${WAV%.wav}.vtt"
  # The transcript may have been written as "<name>.wav.vtt"; normalise it to
  # "<name>.vtt" in that case so both input branches behave the same.
  if [ -f "${WAV}.vtt" ]; then
    mv -f -- "${WAV}.vtt" "$VTT_FILE"
  fi
  if [ ! -f "$VTT_FILE" ]; then
    echo "Error: Whisper-CPP did not produce a VTT transcript for $WAV." >&2
    exit 1
  fi
}

# Encode WAV file $1 as 16k VBR Opus in an OGG container next to it.
# On success the global OGG_FILE holds the output path.
encode_ogg() {
  local WAV="$1"

  OGG_FILE="${WAV%.wav}.ogg"
  if ! ffmpeg -i "$WAV" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
    echo "Error: Failed to convert to OGG format." >&2
    exit 1
  fi
  echo " - Audio: $OGG_FILE"
}

if [ $# -eq 0 ]; then
  echo "Error: No input provided. Please provide a valid URL, VTT file, or a local audio file." >&2
  exit 1
fi

if [[ "$1" == *.vtt ]]; then
  echo "Processing as VTT file..."
  queue_vtt_job "$1" "$1"
elif [[ "$1" =~ ^[Hh][Tt][Tt][Pp][Ss]?:// ]]; then
  # Only a real URL scheme counts; a substring test would also catch local
  # files that merely contain "http" in their name.
  echo "Processing as YouTube URL..."

  # Extract the video title
  if ! VIDEO_TITLE=$(yt-dlp --get-title "$1") || [ -z "$VIDEO_TITLE" ]; then
    echo "Error: Failed to retrieve the video title using yt-dlp. Check the URL and your internet connection." >&2
    exit 1
  fi
  FINAL_BASE_NAME=$(sanitize_filename "$VIDEO_TITLE")

  # Attempt to download subtitles first
  yt-dlp -N 3 --skip-download --write-auto-sub --sub-lang en \
    --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"

  # yt-dlp inserts the language code into subtitle names (<name>.en.vtt), so
  # look for that form as well as the plain <name>.vtt.
  VTT_FILE=$(find "$OUTPUT_DIR" \( -name "${FINAL_BASE_NAME}.vtt" -o -name "${FINAL_BASE_NAME}.*.vtt" \) | head -n 1)

  if [ -n "$VTT_FILE" ]; then
    echo "Subtitles found, processing VTT file..."
    queue_vtt_job "$VTT_FILE" "$1"
  elif [ "$DISABLE_AUDIO" = false ]; then
    echo "No subtitles found, downloading audio and generating transcript..."
    if ! yt-dlp -N 3 -x --audio-format wav --postprocessor-args "-ar 16k" \
      --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"; then
      echo "Error: Failed to download audio using yt-dlp. Check the URL and your internet connection." >&2
      exit 1
    fi

    WAV_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}.wav" | head -n 1)
    if [ -z "$WAV_FILE" ]; then
      echo "Error: WAV file not found after download. Check yt-dlp output." >&2
      exit 1
    fi

    transcribe_wav "$WAV_FILE" "Error: Whisper-CPP transcription failed. Check the model path and audio file."
    queue_vtt_job "$VTT_FILE" "$1"

    # Convert WAV to OGG Opus
    echo "Converting WAV to OGG Opus..."
    encode_ogg "$WAV_FILE"

    # Remove the WAV file
    rm -- "$WAV_FILE"
  else
    # Previously this case fell through silently with an empty job queue.
    echo "Error: No subtitles found and audio download is disabled (-n); nothing to process." >&2
    exit 1
  fi
elif [ -f "$1" ]; then
  echo "Processing as local audio file..."
  INPUT_FILE="$1"
  CREATED_WAV=false

  # Convert to WAV first if not already WAV
  if [[ "$INPUT_FILE" != *.wav ]]; then
    # Strip the extension only when the file name itself has one; a dot that
    # belongs to a directory component (e.g. "./audio") must be left alone.
    if [[ "${INPUT_FILE##*/}" == *.* ]]; then
      WAV_FILE="${INPUT_FILE%.*}.wav"
    else
      WAV_FILE="${INPUT_FILE}.wav"
    fi
    echo "Converting input to WAV format..."
    if ! ffmpeg -i "$INPUT_FILE" -ar 16000 -ac 1 -c:a pcm_s16le ${DURATION:+-t "$DURATION"} -y "$WAV_FILE"; then
      echo "Error: Failed to convert input to WAV format." >&2
      exit 1
    fi
    CREATED_WAV=true
  else
    WAV_FILE="$INPUT_FILE"
  fi

  transcribe_wav "$WAV_FILE" "Error: Whisper-CPP transcription failed."
  queue_vtt_job "$VTT_FILE" "$1"

  if [ "$DISABLE_AUDIO" = false ]; then
    # Convert to OGG Opus
    echo "Converting to OGG Opus..."
    encode_ogg "$WAV_FILE"
  fi

  # Remove the WAV file per CHARTER point 7, but only the intermediate one
  # created above. When the user's input is itself a WAV it must not be deleted.
  if [ "$CREATED_WAV" = true ]; then
    rm -- "$WAV_FILE"
  fi
else
  echo "Error: Invalid input. Provide a valid URL, VTT file, or a local audio file." >&2
  exit 1
fi

process_job_queue
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
import datetime
import glob
import re
import sys
def clean_text(text: str) -> str:
    """Return ``text`` with markup tags removed and whitespace normalised.

    Anything of the form ``<...>`` is dropped, every run of whitespace
    (including newlines and tabs) becomes a single space, and leading and
    trailing whitespace is stripped.
    """
    # Remove HTML tags
    without_tags = re.sub(r'<[^>]+>', '', text)
    # Remove multiple spaces
    collapsed = re.sub(r'\s+', ' ', without_tags)
    # Remove leading/trailing whitespace
    return collapsed.strip()
def is_prefix(a: str, b: str) -> bool:
    """Return True if string ``b`` starts with string ``a``."""
    return b.startswith(a)
def process_vtt(content: str) -> str:
    """Convert WebVTT text into de-duplicated ``HH:MM:SS.mmm text`` lines.

    Each cue contributes its start timestamp (milliseconds are kept) and its
    cleaned text. When consecutive cues only extend the previous cue's text
    (rolling captions), just the last, most complete cue of the run is kept,
    with that cue's own timestamp.

    Blocks without a cue timing line (the ``WEBVTT`` header, NOTE/STYLE
    blocks) are skipped. A cue identifier line before the timing line and the
    short ``MM:SS.mmm`` timestamp form are both accepted.

    Returns the resulting lines joined with newlines ('' if there are none).
    """
    # Normalise line endings so cue splitting also works on CRLF input.
    content = content.replace('\r\n', '\n').replace('\r', '\n')

    # Start of a cue timing line; the hours field is optional in WebVTT.
    timing = re.compile(r'(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})\s+-->')

    processed_captions = []
    pending = None  # (timestamp, text) of the most complete cue in the current run

    # No header-stripping regex is needed: the header block has no timing line
    # and is skipped below. (A lazy "header up to the first blank line" match
    # would swallow the first cue when WEBVTT is directly followed by a blank
    # line.)
    for caption in re.split(r'\n{2,}', content.strip('\ufeff\n')):
        lines = caption.split('\n')

        # Locate the timing line; it is preceded by at most a cue identifier.
        for index, line in enumerate(lines):
            timing_match = timing.match(line.strip())
            if timing_match:
                break
        else:
            continue

        hours = timing_match.group(1) or '00'
        timestamp = (
            f"{hours}:{timing_match.group(2)}:{timing_match.group(3)}"
            f".{timing_match.group(4)}"
        )

        clean_caption = clean_text(' '.join(lines[index + 1:]))
        if not clean_caption:
            continue

        # A cue that does not extend the pending text starts a new run, so the
        # pending (most complete) cue of the previous run is emitted first.
        if pending is not None and not is_prefix(pending[1], clean_caption):
            processed_captions.append(f"{pending[0]} {pending[1]}")
        pending = (timestamp, clean_caption)

    # Don't forget the last run.
    if pending is not None:
        processed_captions.append(f"{pending[0]} {pending[1]}")

    return '\n'.join(processed_captions)
# Command-line entry point: clean every VTT file matching the given path or
# glob pattern and print the cleaned transcript(s) to stdout. Exits with
# status 1 on a usage error, when no file matches, or on any processing error.
if __name__ == "__main__":
    import os  # needed only by this entry point

    try:
        if len(sys.argv) < 2:
            print("Usage: python vttclean.py <file_pattern>", file=sys.stderr)
            sys.exit(1)

        file_pattern = sys.argv[1]
        # An existing path is used literally, so file names that contain glob
        # metacharacters ("[", "]", "*", "?") are not misread as patterns.
        if os.path.isfile(file_pattern):
            filenames = [file_pattern]
        else:
            filenames = sorted(glob.glob(file_pattern))

        # Report a missing input instead of silently printing nothing.
        if not filenames:
            print(f"Error processing input: no files match {file_pattern!r}", file=sys.stderr)
            sys.exit(1)

        for filename in filenames:
            # utf-8-sig reads plain UTF-8 and also drops a leading byte-order mark.
            with open(filename, 'r', encoding='utf-8-sig') as file:
                content = file.read()
            result = process_vtt(content)
            print(result)
    except Exception as e:
        print(f"Error processing input: {e}", file=sys.stderr)
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment