Last active
May 25, 2023 04:15
-
-
Save Sh1n0g1/d69db6dbc5c13ce887c23c6828658570 to your computer and use it in GitHub Desktop.
youtube-summarizer-with-langchain-chatgpt.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Sh1n0g1/d69db6dbc5c13ce887c23c6828658570/youtube-summarizer-with-langchain-chatgpt.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# How to use\n", | |
"* Input OpenAI API Key\n", | |
"* Input YouTube Video ID\n", | |
"  * You can get the Video ID from the URL\n", | |
"  * If the URL is https://www.youtube.com/watch?v=PlQ4Y8knqvA, then the ID is `PlQ4Y8knqvA`\n", | |
"* Configure the caption language (Default: en, ja)\n" | |
], | |
"metadata": { | |
"id": "N19SrdFqDsGv" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 1. Settings\n", | |
"#@markdown **Note:** the API key typed here is stored in the notebook source. Clear it before sharing the notebook.\n", | |
"openai_api_key = \"\" #@param {type:\"string\"}\n", | |
"youtube_video_id = \"\" #@param {type:\"string\"}\n", | |
"#@markdown Specify the Language separated by comma (e.g. `en, ja`)\n", | |
"acceptable_caption_languages = \"en , ja \" #@param {type:\"string\"}\n", | |
"#@markdown If you want to get extra info about summarization, enable this\n", | |
"verbose = True #@param {type:\"boolean\"}\n", | |
"summary_language = \"Japanese\" #@param {type:\"string\"}\n", | |
"\n", | |
"# Fall back to English when the summary language is left blank\n", | |
"# (a whitespace-only value counts as blank).\n", | |
"summary_language = summary_language.strip() or \"English\"\n", | |
"\n", | |
"# Turn the comma-separated form value into a clean list of language codes.\n", | |
"# Blank entries are dropped: splitting an empty string still yields one empty\n", | |
"# item, so without this filter the list is never empty and the default below\n", | |
"# could never apply.\n", | |
"languages = [code.strip() for code in acceptable_caption_languages.split(\",\") if code.strip()]\n", | |
"if not languages:\n", | |
"  languages = [\"en\"]" | |
], | |
"metadata": { | |
"cellView": "form", | |
"id": "rlRFCK440YpK" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "RRYSu48huSUW", | |
"cellView": "form" | |
}, | |
"outputs": [], | |
"source": [ | |
"#@title 2. Install Module \n", | |
"# %pip (rather than !pip) installs into the environment of the running kernel.\n", | |
"# The upper bounds keep the legacy APIs this notebook is written against:\n", | |
"#   langchain 0.0.x            -> `from langchain import OpenAI`\n", | |
"#   openai 0.x                 -> the pre-1.0 client langchain 0.0.x was built on\n", | |
"#   youtube-transcript-api 0.x -> static `YouTubeTranscriptApi.get_transcript`\n", | |
"# NOTE(review): replace the bounds with exact versions once a known-good set is confirmed.\n", | |
"%pip install -q \"langchain<0.1\" \"openai<1\" tiktoken \"youtube-transcript-api<1\"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 3. Import Module\n", | |
"import os\n", | |
"import textwrap\n", | |
"\n", | |
"from langchain import OpenAI, PromptTemplate, LLMChain\n", | |
"from langchain.text_splitter import CharacterTextSplitter\n", | |
"from langchain.chains.mapreduce import MapReduceChain\n", | |
"from langchain.prompts import PromptTemplate\n", | |
"from langchain.docstore.document import Document\n", | |
"from langchain.chains.summarize import load_summarize_chain\n", | |
"from langchain.callbacks import get_openai_callback\n", | |
"import tiktoken\n", | |
"\n", | |
"from youtube_transcript_api import YouTubeTranscriptApi\n" | |
], | |
"metadata": { | |
"cellView": "form", | |
"id": "JBHj00kg0Gqh" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 4. Get Youtube Caption\n", | |
"# Fail early with a clear message when the settings form was left at its\n", | |
"# empty default, instead of surfacing an opaque error from the transcript API.\n", | |
"if not youtube_video_id:\n", | |
"  raise ValueError(\"youtube_video_id is empty; set it in '1. Settings' and re-run that cell.\")\n", | |
"\n", | |
"# `languages` is the list of acceptable caption languages built in step 1.\n", | |
"# Each returned entry is read below through its 'text' and 'start' keys\n", | |
"# ('start' is presumably in seconds -- confirm against the library docs).\n", | |
"youtube_caption = YouTubeTranscriptApi.get_transcript(youtube_video_id, languages)\n", | |
"\n", | |
"# Join the caption snippets into one text. When more than PAUSE_SECONDS passed\n", | |
"# since the previous snippet *started*, the snippet is followed by a comma and\n", | |
"# a newline; otherwise by a single space. The newlines matter: they are the\n", | |
"# only places where the splitter below is allowed to cut.\n", | |
"PAUSE_SECONDS = 3\n", | |
"caption_parts = []\n", | |
"previous_caption_start_time = 0\n", | |
"for entry in youtube_caption:\n", | |
"  gap = entry['start'] - previous_caption_start_time\n", | |
"  caption_parts.append(entry['text'] + (',\\n' if gap > PAUSE_SECONDS else ' '))\n", | |
"  previous_caption_start_time = entry['start']\n", | |
"caption = ''.join(caption_parts)\n", | |
"\n", | |
"print(f\"{caption}\\n{'='*20}\")\n", | |
"print(f\"Length of Text:{len(caption)}\")\n", | |
"encoding = tiktoken.get_encoding(\"cl100k_base\")\n", | |
"print(f\"Tokens: {len(encoding.encode(caption))}\")\n", | |
"\n", | |
"# Split on newlines into chunks and wrap each chunk as a Document for the\n", | |
"# summarize chain.\n", | |
"text_splitter = CharacterTextSplitter(chunk_size=1000, separator=\"\\n\")\n", | |
"texts = text_splitter.split_text(caption)\n", | |
"print(f\"{len(texts)} chunks created.\")\n", | |
"docs = [Document(page_content=t) for t in texts]" | |
], | |
"metadata": { | |
"cellView": "form", | |
"id": "RzL3WQro0TOg" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "M5uYr_-TPKgR" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 5. Use a Custom Prompt (Optional)\n", | |
"# Prompt template for the per-chunk step; {text} is the only input variable.\n", | |
"map_prompt_template = \"\"\"Write a concise summary of the following YouTube transcription:\n", | |
"###\n", | |
"{text}\n", | |
"\n", | |
"SUMMARY:\n", | |
"\n", | |
"\"\"\"\n", | |
"\n", | |
"# Prompt template for the final step. The output language chosen in\n", | |
"# '1. Settings' is baked into the instruction text here.\n", | |
"final_prompt_template = \"Write a bullet point summary of the following in \" + summary_language + \"\"\":\n", | |
"###\n", | |
"{text}\n", | |
"\n", | |
"SUMMARY:\"\"\"\n", | |
"\n", | |
"map_prompt = PromptTemplate(template=map_prompt_template, input_variables=[\"text\"])\n", | |
"final_prompt = PromptTemplate(template=final_prompt_template, input_variables=[\"text\"])" | |
], | |
"metadata": { | |
"id": "YiDNATpDHdsz" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 6. Get Summary\n", | |
"# Export the key only when one was entered, so an OPENAI_API_KEY that already\n", | |
"# exists in the environment is not overwritten with an empty string.\n", | |
"if openai_api_key:\n", | |
"  os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n", | |
"\n", | |
"# No cell in this notebook defines `use_custom_prompt`, so reading it directly\n", | |
"# raised NameError. Honor the flag if the user has set it; otherwise use the\n", | |
"# custom prompts exactly when the optional step 5 has been run. The result is\n", | |
"# stored under a different name so that re-running this cell after running\n", | |
"# step 5 re-evaluates the decision instead of reusing a stale value.\n", | |
"custom_prompts_ready = \"map_prompt\" in globals() and \"final_prompt\" in globals()\n", | |
"custom_prompt_enabled = globals().get(\"use_custom_prompt\", custom_prompts_ready)\n", | |
"\n", | |
"llm = OpenAI(temperature=0)\n", | |
"with get_openai_callback() as cb:\n", | |
"  if custom_prompt_enabled:\n", | |
"    chain = load_summarize_chain(llm, chain_type=\"map_reduce\", verbose=verbose, map_prompt=map_prompt, combine_prompt=final_prompt)\n", | |
"  else:\n", | |
"    chain = load_summarize_chain(llm, chain_type=\"map_reduce\", verbose=verbose)\n", | |
"  output_summary = chain.run(docs)\n", | |
"\n", | |
"  # Show what the callback recorded for the calls made above.\n", | |
"  print(cb)\n" | |
], | |
"metadata": { | |
"id": "h4nRP8oGO2qf" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title 7. Print Summary\n", | |
"\n", | |
"# Write the summary produced in step 6 to the cell output, one line per\n", | |
"# newline-separated line of the text.\n", | |
"print(output_summary)" | |
], | |
"metadata": { | |
"id": "nbW7Yqp4OAlm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment