luebken · April 22, 2023 14:32
diff --git a/fetch_subtitles.sh b/fetch_subtitles.sh
 #!/bin/bash
 #
 # Fetch the auto-generated subtitles of every video in a playlist and write
 # one "<title>.content" file per video containing the title, the description
 # and the converted subtitle text.
 #
 # Usage: fetch_subtitles.sh FOLDER PLAYLIST
 #   FOLDER    output directory (created if missing)
 #   PLAYLIST  playlist reference handed to yt-dlp
 #
 # Runs yt-dlp and vtt2text, so both must be on PATH. Uses playlist.txt and
 # tmp/ in the current directory as scratch space.

 set -eu
 # Without pipefail a failed title lookup is masked by tr's exit status and
 # the video would be stored under the bare name ".content".
 set -o pipefail

 if [ "$#" -lt 2 ]; then
    printf 'Usage: %s FOLDER PLAYLIST\n' "$0" >&2
    exit 2
 fi

 FOLDER=$1
 PLAYLIST=$2

 # Start from an empty list; truncating (rather than deleting) guarantees the
 # file exists for the read loop below even when yt-dlp prints nothing.
 : > playlist.txt
 mkdir -p -- "$FOLDER"
 # The loop works from inside tmp/, so resolve the destination once to an
 # absolute path; this keeps both relative and absolute FOLDER values working.
 FOLDER_ABS=$(CDPATH= cd -- "$FOLDER" && pwd)
 yt-dlp --flat-playlist -i --print-to-file url playlist.txt "$PLAYLIST"

 # Read the URLs on fd 3 so commands inside the loop cannot swallow the list
 # through stdin, and so URLs are never word-split or glob-expanded.
 while IFS= read -r i <&3 || [ -n "$i" ]
 do
    [ -n "$i" ] || continue

    # Title with whitespace runs collapsed to "_". The tr set is kept exactly
    # as-is so names stay compatible with files written by earlier runs; note
    # that its outer brackets are part of the set, so "[" and "]" in a title
    # are replaced as well.
    FILENAME=$(yt-dlp --get-title --skip-download "$i" | tr -s '[[:space:]]' '_').content
    # A "/" in the title would otherwise be read as a directory separator.
    FILENAME=${FILENAME//\//_}

    # Already fetched on a previous run.
    if [ -f "$FOLDER_ABS/$FILENAME" ]; then
        continue
    fi

    rm -rf tmp
    mkdir -p tmp
    cd tmp

    # fetch subtitle
    yt-dlp --skip-download \
        --sub-lang en-orig \
        --write-auto-sub \
        "$i"

    # Collect the matches in an array: a bare glob inside `[ -f ... ]` fails
    # with "too many arguments" as soon as more than one .vtt file exists.
    # With no match the pattern stays literal and the -f test is false.
    VTT_FILES=(*.vtt)
    if [ -f "${VTT_FILES[0]}" ]; then
        # convert subtitle (the *.txt files read below are presumably what
        # vtt2text writes - confirm against the tool)
        for j in "${VTT_FILES[@]}"
        do
            vtt2text "$j"
        done

        # get title and description
        yt-dlp --get-title --get-description --skip-download "$i" > "./$FILENAME"
        cat -- *.txt >> "./$FILENAME"

        # Assemble in tmp/ and move last, so FOLDER never holds a partial file
        # that a later run would mistake for a finished one.
        mv -- "./$FILENAME" "$FOLDER_ABS/$FILENAME"
    fi

    cd ..
 done 3< playlist.txt
diff --git a/train-with-subtitles.py b/train-with-subtitles.py
 import os
 import logging
 import sys
 import textwrap

 from llama_index import (
    GPTKeywordTableIndex,
    Document,
    SimpleDirectoryReader,
    LLMPredictor,
 )
 from langchain import OpenAI


 if __name__ == "__main__":
    # Interactive question answering over a folder of subtitle files.
    #
    # Usage: train-with-subtitles.py SUBTITLES_FOLDER
    #
    # When index.json does not exist in the current directory, an index is
    # built from SUBTITLES_FOLDER and saved there. When it exists, it is
    # loaded and the folder argument is not read.
    logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
    # NOTE(review): basicConfig above normally installs a stdout handler
    # already, so this second handler may emit each record twice - confirm.
    logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

    if not os.path.exists("index.json"):
        # The folder is only required when the index has to be built; fail
        # with a usage message instead of a bare IndexError.
        if len(sys.argv) < 2:
            sys.exit(f"Usage: {sys.argv[0]} SUBTITLES_FOLDER")
        subtitles_folder = sys.argv[1]
        documents = SimpleDirectoryReader(subtitles_folder).load_data()
        llm_predictor = LLMPredictor(
            llm=OpenAI(
                temperature=0,
                model_name="text-davinci-003",
                max_tokens=2048,
            )
        )
        index = GPTKeywordTableIndex(documents, llm_predictor=llm_predictor)
        index.save_to_disk("index.json")
    else:
        index = GPTKeywordTableIndex.load_from_disk("index.json")

    # Prompt loop: one query per line until the user interrupts.
    while True:
        try:
            prompt = input("What should I figure out? ")
            response = index.query(prompt)
            response = str(response).strip()
            if not response:
                # Nothing to show; ask again.
                continue
            for line in textwrap.wrap(response, width=75):
                print(line)
            print("-----")
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C, or end of input (Ctrl-D / exhausted pipe), ends the
            # session cleanly instead of crashing with a traceback.
            break
	#!/bin/bash
	#
	# Fetch the auto-generated subtitles of every video in a playlist and write
	# one "<title>.content" file per video containing the title, the description
	# and the converted subtitle text.
	#
	# Usage: fetch_subtitles.sh FOLDER PLAYLIST
	#   FOLDER    output directory (created if missing)
	#   PLAYLIST  playlist reference handed to yt-dlp
	#
	# Runs yt-dlp and vtt2text, so both must be on PATH. Uses playlist.txt and
	# tmp/ in the current directory as scratch space.

	set -eu
	# Without pipefail a failed title lookup is masked by tr's exit status and
	# the video would be stored under the bare name ".content".
	set -o pipefail

	if [ "$#" -lt 2 ]; then
	    printf 'Usage: %s FOLDER PLAYLIST\n' "$0" >&2
	    exit 2
	fi

	FOLDER=$1
	PLAYLIST=$2

	# Start from an empty list; truncating (rather than deleting) guarantees the
	# file exists for the read loop below even when yt-dlp prints nothing.
	: > playlist.txt
	mkdir -p -- "$FOLDER"
	# The loop works from inside tmp/, so resolve the destination once to an
	# absolute path; this keeps both relative and absolute FOLDER values working.
	FOLDER_ABS=$(CDPATH= cd -- "$FOLDER" && pwd)
	yt-dlp --flat-playlist -i --print-to-file url playlist.txt "$PLAYLIST"

	# Read the URLs on fd 3 so commands inside the loop cannot swallow the list
	# through stdin, and so URLs are never word-split or glob-expanded.
	while IFS= read -r i <&3 || [ -n "$i" ]
	do
	    [ -n "$i" ] || continue

	    # Title with whitespace runs collapsed to "_". The "|" must be a real
	    # pipe: a backslash-escaped one is handed to yt-dlp as an argument and
	    # tr never runs. The outer brackets of the tr set are part of the set,
	    # so "[" and "]" in a title are replaced as well.
	    FILENAME=$(yt-dlp --get-title --skip-download "$i" | tr -s '[[:space:]]' '_').content
	    # A "/" in the title would otherwise be read as a directory separator.
	    FILENAME=${FILENAME//\//_}

	    # Already fetched on a previous run.
	    if [ -f "$FOLDER_ABS/$FILENAME" ]; then
	        continue
	    fi

	    rm -rf tmp
	    mkdir -p tmp
	    cd tmp

	    # fetch subtitle
	    yt-dlp --skip-download \
	        --sub-lang en-orig \
	        --write-auto-sub \
	        "$i"

	    # Collect the matches in an array: a bare glob inside `[ -f ... ]` fails
	    # with "too many arguments" as soon as more than one .vtt file exists.
	    # With no match the pattern stays literal and the -f test is false.
	    VTT_FILES=(*.vtt)
	    if [ -f "${VTT_FILES[0]}" ]; then
	        # convert subtitle (the *.txt files read below are presumably what
	        # vtt2text writes - confirm against the tool)
	        for j in "${VTT_FILES[@]}"
	        do
	            vtt2text "$j"
	        done

	        # get title and description
	        yt-dlp --get-title --get-description --skip-download "$i" > "./$FILENAME"
	        cat -- *.txt >> "./$FILENAME"

	        # Assemble in tmp/ and move last, so FOLDER never holds a partial
	        # file that a later run would mistake for a finished one.
	        mv -- "./$FILENAME" "$FOLDER_ABS/$FILENAME"
	    fi

	    cd ..
	done 3< playlist.txt
	import os
	import logging
	import sys
	import textwrap

	from llama_index import (
	GPTKeywordTableIndex,
	Document,
	SimpleDirectoryReader,
	LLMPredictor,
	)
	from langchain import OpenAI


	if __name__ == "__main__":
	    # Interactive question answering over a folder of subtitle files.
	    #
	    # Usage: train-with-subtitles.py SUBTITLES_FOLDER
	    #
	    # When index.json does not exist in the current directory, an index is
	    # built from SUBTITLES_FOLDER and saved there. When it exists, it is
	    # loaded and the folder argument is not read.
	    logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
	    # NOTE(review): basicConfig above normally installs a stdout handler
	    # already, so this second handler may emit each record twice - confirm.
	    logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

	    if not os.path.exists("index.json"):
	        # The folder is only required when the index has to be built; fail
	        # with a usage message instead of a bare IndexError.
	        if len(sys.argv) < 2:
	            sys.exit(f"Usage: {sys.argv[0]} SUBTITLES_FOLDER")
	        subtitles_folder = sys.argv[1]
	        documents = SimpleDirectoryReader(subtitles_folder).load_data()
	        llm_predictor = LLMPredictor(
	            llm=OpenAI(
	                temperature=0,
	                model_name="text-davinci-003",
	                max_tokens=2048,
	            )
	        )
	        index = GPTKeywordTableIndex(documents, llm_predictor=llm_predictor)
	        index.save_to_disk("index.json")
	    else:
	        index = GPTKeywordTableIndex.load_from_disk("index.json")

	    # Prompt loop: one query per line until the user interrupts.
	    while True:
	        try:
	            prompt = input("What should I figure out? ")
	            response = index.query(prompt)
	            response = str(response).strip()
	            if not response:
	                # Nothing to show; ask again.
	                continue
	            for line in textwrap.wrap(response, width=75):
	                print(line)
	            print("-----")
	        except (KeyboardInterrupt, EOFError):
	            # Ctrl-C, or end of input (Ctrl-D / exhausted pipe), ends the
	            # session cleanly instead of crashing with a traceback.
	            break