Created
December 1, 2023 17:19
-
-
Save ritwikraha/76ba0856b8c73b62083869140adca6af to your computer and use it in GitHub Desktop.
yt-transcript.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyPhPAYMRz95/kK9H7CkatAx", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ritwikraha/76ba0856b8c73b62083869140adca6af/yt-transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xjUUWhux7w05", | |
"outputId": "f5ec4efb-a5d0-4bc5-8757-86c954374735" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting youtube_transcript_api\n", | |
" Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from youtube_transcript_api) (2.31.0)\n", | |
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.3.2)\n", | |
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (3.4)\n", | |
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2.0.7)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->youtube_transcript_api) (2023.7.22)\n", | |
"Installing collected packages: youtube_transcript_api\n", | |
"Successfully installed youtube_transcript_api-0.6.1\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pin the release shown in this cell's recorded output so the YouTubeTranscriptApi\n", | |
"# calls used later keep working; %pip installs into the running kernel's environment.\n", | |
"%pip install youtube_transcript_api==0.6.1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Importing the YouTubeTranscriptApi module from the youtube_transcript_api package\n", | |
"from youtube_transcript_api import YouTubeTranscriptApi" | |
], | |
"metadata": { | |
"id": "bSTqToCf8jmp" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import googleapiclient.discovery" | |
], | |
"metadata": { | |
"id": "THQfl9IzV44p" | |
}, | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"from getpass import getpass\n", | |
"\n", | |
"# Read the API key from the environment, or prompt for it, so the key is never\n", | |
"# stored in the notebook source.\n", | |
"api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('YouTube Data API key: ')\n", | |
"\n", | |
"# Create a YouTube Data API v3 service object.\n", | |
"youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=api_key)\n", | |
"\n", | |
"# ID of the playlist whose videos will be transcribed.\n", | |
"playlist_id = 'PLJicmE8fK0EiFRt1Hm5a_7SJFaikIFW30'\n", | |
"\n", | |
"# Build the playlistItems.list() request. maxResults defaults to 5, so request the\n", | |
"# largest page size (50) instead of silently getting only the first five videos.\n", | |
"request = youtube.playlistItems().list(part='snippet', playlistId=playlist_id, maxResults=50)\n", | |
"\n", | |
"# Execute the request; this returns the first page of results.\n", | |
"response = request.execute()" | |
], | |
"metadata": { | |
"id": "RjKLxajJWK3b" | |
}, | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Collect the video IDs from every page of the playlist, not only the first response.\n", | |
"video_ids = []\n", | |
"page_request, page_response = request, response\n", | |
"while page_response is not None:\n", | |
"    # Tolerate a page that carries no 'items' key.\n", | |
"    for item in page_response.get('items', []):\n", | |
"        video_ids.append(item['snippet']['resourceId']['videoId'])\n", | |
"\n", | |
"    # list_next() builds the request for the following page and returns None\n", | |
"    # once the last page has been consumed.\n", | |
"    page_request = youtube.playlistItems().list_next(page_request, page_response)\n", | |
"    page_response = page_request.execute() if page_request is not None else None" | |
], | |
"metadata": { | |
"id": "Yewhvw21XuH1" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Sanity check: show the video IDs collected from the playlist response.\n", | |
"print(video_ids)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "o7_nFEn2VjTc", | |
"outputId": "e25fb659-9703-480c-b29e-4f24ce0817ea" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"['N5vJSNXPEwA', '7yDmGnA8Hw0', '98TQv5IAtY8', 'mmkCS5eA4f8', 'LKvjIsyYng8']\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def generate_transcript(id):\n", | |
"    \"\"\"Fetch a video's transcript and flatten it into a single string.\n", | |
"\n", | |
"    Args:\n", | |
"        id: YouTube video ID passed to YouTubeTranscriptApi.get_transcript().\n", | |
"\n", | |
"    Returns:\n", | |
"        A (script, word_count) tuple. script holds the text of every segment\n", | |
"        except those that are exactly '[Music]', each followed by a single\n", | |
"        space; word_count is the number of whitespace-separated words in it.\n", | |
"    \"\"\"\n", | |
"    segments = YouTubeTranscriptApi.get_transcript(id)\n", | |
"\n", | |
"    # Keep the spoken text only; '[Music]' marker segments are dropped.\n", | |
"    spoken = [segment[\"text\"] for segment in segments if segment[\"text\"] != '[Music]']\n", | |
"\n", | |
"    # Every kept segment is followed by one space, including the last one.\n", | |
"    script = \"\".join(piece + \" \" for piece in spoken)\n", | |
"\n", | |
"    return script, len(script.split())\n" | |
], | |
"metadata": { | |
"id": "QmqJkacT78_X" | |
}, | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import csv\n", | |
"\n", | |
"from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable\n", | |
"\n", | |
"# Path to the CSV file\n", | |
"csv_file_path = 'transcripts.csv'\n", | |
"\n", | |
"# IDs of videos whose transcript could not be fetched, reported after the export.\n", | |
"skipped_ids = []\n", | |
"\n", | |
"# newline='' lets the csv module control line endings itself.\n", | |
"with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:\n", | |
"    writer = csv.writer(file)\n", | |
"\n", | |
"    # Header row\n", | |
"    writer.writerow(['video_id', 'raw_text'])\n", | |
"\n", | |
"    for video_id in video_ids:\n", | |
"        try:\n", | |
"            # The word count returned alongside the text is not needed here.\n", | |
"            transcript, _word_count = generate_transcript(video_id)\n", | |
"        except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable):\n", | |
"            # One video without a usable transcript should not abort the whole\n", | |
"            # export; remember it and carry on with the rest of the playlist.\n", | |
"            skipped_ids.append(video_id)\n", | |
"            continue\n", | |
"\n", | |
"        writer.writerow([video_id, transcript])\n", | |
"\n", | |
"# Inform the user that the process is complete\n", | |
"print(\"CSV file has been created with transcripts.\")\n", | |
"if skipped_ids:\n", | |
"    print(f\"Skipped {len(skipped_ids)} video(s) without a retrievable transcript: {skipped_ids}\")\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "0bzBtdbGYHVX", | |
"outputId": "a591b152-3f62-4a78-ef9f-5e11e2539316" | |
}, | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"CSV file has been created with transcripts.\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment