bioshazard · May 17, 2023 20:01
diff --git a/podcast-summarizer.py b/podcast-summarizer.py
 # WIP attempt at summarizing a podcast given a timestamped transcript
 # https://python.langchain.com/en/latest/modules/chains/index_examples/summarize.html
 import os
 from langchain import OpenAI, PromptTemplate, LLMChain
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains.mapreduce import MapReduceChain
 from langchain.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.docstore.document import Document
 from langchain.chains.summarize import load_summarize_chain

 # One LLM instance is shared by the map step and the combine step below.
 llm = OpenAI(temperature=0)

 # The transcript path is supplied through the TRANSCRIPT_FILE environment
 # variable. Stop with a readable message when it is unset or empty instead
 # of letting open() fail with an opaque TypeError/FileNotFoundError.
 transcript_file = os.getenv("TRANSCRIPT_FILE")
 if not transcript_file:
     raise SystemExit(
         "Set the TRANSCRIPT_FILE environment variable to the path of the transcript."
     )
 with open(transcript_file, encoding="utf-8") as f:
     transcript_text = f.read()

 # Split the transcript into overlapping chunks; sizes are measured with
 # len(), i.e. in characters.
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=3000,
     chunk_overlap=200,
     length_function=len,
 )

 docs = text_splitter.create_documents([transcript_text])
 print("Docs array length:", len(docs))

 # Map prompt: passed as map_prompt, with a chunk substituted for {text}.
 summary_and_bullets = "\n\n\n".join([
     "{text}",
     "GIVEN THE ABOVE, PROVIDE A SUMMARY AND EXTRACT A BULLET LIST OF 2-WORD-MAXIMUM CATCHY TOPIC WITH ASSOCIATED TIMESTAMP:",
 ])
 PROMPTsab = PromptTemplate(template=summary_and_bullets, input_variables=["text"])

 # Combine prompt: passed as combine_prompt; its wording expects {text} to
 # hold the per-chunk summaries and timestamp bullets.
 podcast_description = "\n\n\n".join([
     "{text}",
     "GIVEN THE ABOVE SUMMARIES AND TIMESTAMP BULLETS, PROVIDE A VERY CONCISE CATCHY PODCAST EPISODE SUMMARY AND A BULLET LIST OF THE CATCHIEST 2-WORD-MAXIMUM TOPICS WITH ASSOCIATED TIMESTAMP:",
 ])
 PROMPTpd = PromptTemplate(template=podcast_description, input_variables=["text"])

 # Build the map-reduce summarization chain and run it over all chunks.
 chain = load_summarize_chain(
     llm,
     chain_type="map_reduce",
     map_prompt=PROMPTsab,
     combine_prompt=PROMPTpd,
 )
 result = chain.run(docs)

 print(result)
	# WIP attempt at summarizing a podcast given a timestamped transcript
	# https://python.langchain.com/en/latest/modules/chains/index_examples/summarize.html
	import os
	from langchain import OpenAI, PromptTemplate, LLMChain
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.chains.mapreduce import MapReduceChain
	from langchain.prompts import PromptTemplate
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.docstore.document import Document
	from langchain.chains.summarize import load_summarize_chain

	# One LLM instance is shared by the map step and the combine step below.
	llm = OpenAI(temperature=0)

	# The transcript path is supplied through the TRANSCRIPT_FILE environment
	# variable. Stop with a readable message when it is unset or empty instead
	# of letting open() fail with an opaque TypeError/FileNotFoundError.
	transcript_file = os.getenv("TRANSCRIPT_FILE")
	if not transcript_file:
	    raise SystemExit(
	        "Set the TRANSCRIPT_FILE environment variable to the path of the transcript."
	    )
	# The body of the with-statement must be indented below it; the pasted
	# copy had lost that indentation, which is a syntax error.
	with open(transcript_file, encoding="utf-8") as f:
	    transcript_text = f.read()

	# Split the transcript into overlapping chunks; sizes are measured with
	# len(), i.e. in characters.
	text_splitter = RecursiveCharacterTextSplitter(
	    chunk_size=3000,
	    chunk_overlap=200,
	    length_function=len,
	)

	docs = text_splitter.create_documents([transcript_text])
	print("Docs array length:", len(docs))

	# Map prompt: passed as map_prompt, with a chunk substituted for {text}.
	summary_and_bullets = "\n\n\n".join([
	    "{text}",
	    "GIVEN THE ABOVE, PROVIDE A SUMMARY AND EXTRACT A BULLET LIST OF 2-WORD-MAXIMUM CATCHY TOPIC WITH ASSOCIATED TIMESTAMP:",
	])
	PROMPTsab = PromptTemplate(template=summary_and_bullets, input_variables=["text"])

	# Combine prompt: passed as combine_prompt; its wording expects {text} to
	# hold the per-chunk summaries and timestamp bullets.
	podcast_description = "\n\n\n".join([
	    "{text}",
	    "GIVEN THE ABOVE SUMMARIES AND TIMESTAMP BULLETS, PROVIDE A VERY CONCISE CATCHY PODCAST EPISODE SUMMARY AND A BULLET LIST OF THE CATCHIEST 2-WORD-MAXIMUM TOPICS WITH ASSOCIATED TIMESTAMP:",
	])
	PROMPTpd = PromptTemplate(template=podcast_description, input_variables=["text"])

	# Build the map-reduce summarization chain and run it over all chunks.
	chain = load_summarize_chain(
	    llm,
	    chain_type="map_reduce",
	    map_prompt=PROMPTsab,
	    combine_prompt=PROMPTpd,
	)
	result = chain.run(docs)

	print(result)