Last active
November 9, 2024 23:51
-
-
Save Bluscream/00cba0c357adc5e7ecfc6234759f4be2 to your computer and use it in GitHub Desktop.
YouTube subtitles fetcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install requests bbpb | |
import base64, json | |
import requests, blackboxprotobuf | |
from typing import Dict, Any | |
class YouTubeTranscriptDownloader:
    """Download video transcripts from YouTube's ``get_transcript`` endpoint.

    The request is a JSON POST whose ``params`` field is a base64-encoded
    protobuf message built by :meth:`encode_protobuf`.
    """

    # Endpoint the transcript request is POSTed to.
    url = 'https://www.youtube.com/youtubei/v1/get_transcript'
    # HTTP headers sent with every request.
    headers = {
        'Content-Type': 'application/json'
    }
    # Client description sent as the ``context`` field of the request body.
    context = {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313'
        }
    }

    @staticmethod
    def encode_protobuf(message: Dict[str, Any], typedef: Dict[str, Dict[str, str]]) -> str:
        """Encode a message dict with a given typedef dict to base64-encoded protobuf.

        Args:
            message (Dict[str, Any]): Message dictionary to encode, keyed by
                protobuf field number (as a string).
            typedef (Dict[str, Dict[str, str]]): Mirror of the message
                dictionary that holds a type description instead of a value
                for each field.

        Returns:
            str: base64 encoded protobuf message.
        """
        # NOTE(review): a ``KeyError: 'name'`` raised inside blackboxprotobuf's
        # encode_message has been reported for the name-less typedefs used by
        # this class; presumably that comes from a blackboxprotobuf build other
        # than the ``bbpb`` package -- confirm which package is installed.
        data = blackboxprotobuf.encode_message(message, typedef)
        return base64.b64encode(data).decode('ascii')

    def get(self, videoId: str, lang="en", automatic=True, timeout: float = 30):
        """Get a transcript in YouTube's proprietary JSON form for a video by its ID.

        Args:
            videoId (str): YouTube video ID.
            lang (str, optional): 2 letter language code. Defaults to "en".
            automatic (bool, optional): Whether to get the automatically
                generated captions. Defaults to True.
            timeout (float, optional): Seconds to wait for the HTTP request
                before ``requests`` gives up; forwarded unchanged to
                ``requests.post``. Defaults to 30.

        Returns:
            The response body decoded from JSON.
        """
        # Inner message: field 2 carries the language code; field 1 is set to
        # 'asr' only when the automatically generated track is requested.
        if automatic:
            lang_dict = {'1': 'asr', '2': lang}
            lang_type = {'1': {'type': 'string'}, '2': {'type': 'string'}}
        else:
            lang_dict = {'2': lang}
            lang_type = {'2': {'type': 'string'}}

        # Outer message: field 1 is the video ID, field 2 is the inner message
        # embedded as a base64 string.
        message = {
            '1': videoId,
            '2': YouTubeTranscriptDownloader.encode_protobuf(lang_dict, lang_type),
        }
        msg_type = {'1': {'type': 'string'}, '2': {'type': 'string'}}
        params = YouTubeTranscriptDownloader.encode_protobuf(message, msg_type)

        payload = {
            'context': self.context,
            'params': params
        }
        print("Getting transcript for video",videoId,"in language",lang,"(automatic)" if automatic else "")
        # Without a timeout a stalled connection would block this call forever.
        response = requests.post(self.url, headers=self.headers, json=payload, timeout=timeout)
        return response.json()
if __name__ == "__main__":  # Example usage
    from sys import argv

    fetcher = YouTubeTranscriptDownloader()

    # The video ID comes from the last command-line argument when one is
    # given; otherwise the user is prompted for it.
    if len(argv) > 1:
        video_id = argv[-1]
    else:
        video_id = input("Video ID:")

    # An empty answer falls back to English.
    language = input("Language (2 letter lowercase):") or "en"
    # Any non-empty answer requests the automatically generated captions.
    want_automatic = bool(input("Automaticly generated? (empty=No)"))

    transcript = fetcher.get(video_id, language, want_automatic)
    if transcript:
        # Show the pretty-printed response and save the same text to disk.
        rendered = json.dumps(transcript, indent=4)
        print(rendered)
        with open("transcript.json", 'w') as out_file:
            out_file.write(rendered)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"$schema": "http://json-schema.org/draft-06/schema#", | |
"$ref": "#/definitions/TranscriptResponse", | |
"definitions": { | |
"TranscriptResponse": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"responseContext": { | |
"$ref": "#/definitions/ResponseContext" | |
}, | |
"actions": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/Action" | |
} | |
}, | |
"trackingParams": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptResponse" | |
}, | |
"Action": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"clickTrackingParams": { | |
"type": "string" | |
}, | |
"updateEngagementPanelAction": { | |
"$ref": "#/definitions/UpdateEngagementPanelAction" | |
} | |
}, | |
"required": [], | |
"title": "Action" | |
}, | |
"UpdateEngagementPanelAction": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"targetId": { | |
"type": "string" | |
}, | |
"content": { | |
"$ref": "#/definitions/UpdateEngagementPanelActionContent" | |
} | |
}, | |
"required": [], | |
"title": "UpdateEngagementPanelAction" | |
}, | |
"UpdateEngagementPanelActionContent": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptRenderer": { | |
"$ref": "#/definitions/TranscriptRenderer" | |
} | |
}, | |
"required": [], | |
"title": "UpdateEngagementPanelActionContent" | |
}, | |
"TranscriptRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"trackingParams": { | |
"type": "string" | |
}, | |
"content": { | |
"$ref": "#/definitions/TranscriptRendererContent" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptRenderer" | |
}, | |
"TranscriptRendererContent": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptSearchPanelRenderer": { | |
"$ref": "#/definitions/TranscriptSearchPanelRenderer" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptRendererContent" | |
}, | |
"TranscriptSearchPanelRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"header": { | |
"$ref": "#/definitions/Header" | |
}, | |
"body": { | |
"$ref": "#/definitions/Body" | |
}, | |
"footer": { | |
"$ref": "#/definitions/Footer" | |
}, | |
"trackingParams": { | |
"type": "string" | |
}, | |
"targetId": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptSearchPanelRenderer" | |
}, | |
"Body": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptSegmentListRenderer": { | |
"$ref": "#/definitions/TranscriptSegmentListRenderer" | |
} | |
}, | |
"required": [], | |
"title": "Body" | |
}, | |
"TranscriptSegmentListRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"initialSegments": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/InitialSegment" | |
} | |
}, | |
"noResultLabel": { | |
"$ref": "#/definitions/NoResultLabel" | |
}, | |
"retryLabel": { | |
"$ref": "#/definitions/NoResultLabel" | |
}, | |
"touchCaptionsEnabled": { | |
"type": "boolean" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptSegmentListRenderer" | |
}, | |
"InitialSegment": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptSegmentRenderer": { | |
"$ref": "#/definitions/TranscriptSegmentRenderer" | |
} | |
}, | |
"required": [], | |
"title": "InitialSegment" | |
}, | |
"TranscriptSegmentRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"startMs": { | |
"type": "string", | |
"format": "integer" | |
}, | |
"endMs": { | |
"type": "string", | |
"format": "integer" | |
}, | |
"snippet": { | |
"$ref": "#/definitions/NoResultLabel" | |
}, | |
"startTimeText": { | |
"$ref": "#/definitions/StartTimeText" | |
}, | |
"trackingParams": { | |
"type": "string" | |
}, | |
"accessibility": { | |
"$ref": "#/definitions/Accessibility" | |
}, | |
"targetId": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptSegmentRenderer" | |
}, | |
"Accessibility": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"accessibilityData": { | |
"$ref": "#/definitions/AccessibilityData" | |
} | |
}, | |
"required": [], | |
"title": "Accessibility" | |
}, | |
"AccessibilityData": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"label": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "AccessibilityData" | |
}, | |
"NoResultLabel": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"runs": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/Run" | |
} | |
} | |
}, | |
"required": [], | |
"title": "NoResultLabel" | |
}, | |
"Run": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"text": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "Run" | |
}, | |
"StartTimeText": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"simpleText": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "StartTimeText" | |
}, | |
"Footer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptFooterRenderer": { | |
"$ref": "#/definitions/TranscriptFooterRenderer" | |
} | |
}, | |
"required": [], | |
"title": "Footer" | |
}, | |
"TranscriptFooterRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"languageMenu": { | |
"$ref": "#/definitions/LanguageMenu" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptFooterRenderer" | |
}, | |
"LanguageMenu": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"sortFilterSubMenuRenderer": { | |
"$ref": "#/definitions/SortFilterSubMenuRenderer" | |
} | |
}, | |
"required": [], | |
"title": "LanguageMenu" | |
}, | |
"SortFilterSubMenuRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"subMenuItems": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/SubMenuItem" | |
} | |
}, | |
"trackingParams": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "SortFilterSubMenuRenderer" | |
}, | |
"SubMenuItem": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"title": { | |
"type": "string" | |
}, | |
"selected": { | |
"type": "boolean" | |
}, | |
"continuation": { | |
"$ref": "#/definitions/Continuation" | |
}, | |
"trackingParams": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "SubMenuItem" | |
}, | |
"Continuation": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"reloadContinuationData": { | |
"$ref": "#/definitions/ReloadContinuationData" | |
} | |
}, | |
"required": [], | |
"title": "Continuation" | |
}, | |
"ReloadContinuationData": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"continuation": { | |
"type": "string" | |
}, | |
"clickTrackingParams": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "ReloadContinuationData" | |
}, | |
"Header": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"transcriptSearchBoxRenderer": { | |
"$ref": "#/definitions/TranscriptSearchBoxRenderer" | |
} | |
}, | |
"required": [], | |
"title": "Header" | |
}, | |
"TranscriptSearchBoxRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"formattedPlaceholder": { | |
"$ref": "#/definitions/NoResultLabel" | |
}, | |
"accessibility": { | |
"$ref": "#/definitions/Accessibility" | |
}, | |
"clearButton": { | |
"$ref": "#/definitions/ClearButton" | |
}, | |
"onTextChangeCommand": { | |
"$ref": "#/definitions/OnTextChangeCommand" | |
}, | |
"trackingParams": { | |
"type": "string" | |
}, | |
"searchButton": { | |
"$ref": "#/definitions/SearchButton" | |
} | |
}, | |
"required": [], | |
"title": "TranscriptSearchBoxRenderer" | |
}, | |
"ClearButton": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"buttonRenderer": { | |
"$ref": "#/definitions/ClearButtonButtonRenderer" | |
} | |
}, | |
"required": [], | |
"title": "ClearButton" | |
}, | |
"ClearButtonButtonRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"icon": { | |
"$ref": "#/definitions/Icon" | |
}, | |
"trackingParams": { | |
"type": "string" | |
}, | |
"accessibilityData": { | |
"$ref": "#/definitions/Accessibility" | |
} | |
}, | |
"required": [], | |
"title": "ClearButtonButtonRenderer" | |
}, | |
"Icon": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"iconType": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "Icon" | |
}, | |
"OnTextChangeCommand": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"clickTrackingParams": { | |
"type": "string" | |
}, | |
"commandMetadata": { | |
"$ref": "#/definitions/CommandMetadata" | |
}, | |
"getTranscriptEndpoint": { | |
"$ref": "#/definitions/GetTranscriptEndpoint" | |
} | |
}, | |
"required": [], | |
"title": "OnTextChangeCommand" | |
}, | |
"CommandMetadata": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"webCommandMetadata": { | |
"$ref": "#/definitions/WebCommandMetadata" | |
} | |
}, | |
"required": [], | |
"title": "CommandMetadata" | |
}, | |
"WebCommandMetadata": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"sendPost": { | |
"type": "boolean" | |
}, | |
"apiUrl": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "WebCommandMetadata" | |
}, | |
"GetTranscriptEndpoint": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"params": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "GetTranscriptEndpoint" | |
}, | |
"SearchButton": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"buttonRenderer": { | |
"$ref": "#/definitions/SearchButtonButtonRenderer" | |
} | |
}, | |
"required": [], | |
"title": "SearchButton" | |
}, | |
"SearchButtonButtonRenderer": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"trackingParams": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "SearchButtonButtonRenderer" | |
}, | |
"ResponseContext": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"visitorData": { | |
"type": "string" | |
}, | |
"serviceTrackingParams": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/ServiceTrackingParam" | |
} | |
}, | |
"mainAppWebResponseContext": { | |
"$ref": "#/definitions/MainAppWebResponseContext" | |
}, | |
"webResponseContextExtensionData": { | |
"$ref": "#/definitions/WebResponseContextExtensionData" | |
} | |
}, | |
"required": [], | |
"title": "ResponseContext" | |
}, | |
"MainAppWebResponseContext": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"loggedOut": { | |
"type": "boolean" | |
}, | |
"trackingParam": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "MainAppWebResponseContext" | |
}, | |
"ServiceTrackingParam": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"service": { | |
"type": "string" | |
}, | |
"params": { | |
"type": "array", | |
"items": { | |
"$ref": "#/definitions/Param" | |
} | |
} | |
}, | |
"required": [], | |
"title": "ServiceTrackingParam" | |
}, | |
"Param": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"key": { | |
"type": "string" | |
}, | |
"value": { | |
"type": "string" | |
} | |
}, | |
"required": [], | |
"title": "Param" | |
}, | |
"WebResponseContextExtensionData": { | |
"type": "object", | |
"additionalProperties": false, | |
"properties": { | |
"hasDecorated": { | |
"type": "boolean" | |
} | |
}, | |
"required": [], | |
"title": "WebResponseContextExtensionData" | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When executed inside a virtual env, I get the error below:
From .../lib/python3.12/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 56, in encode_message
if info['name'] == field_number and field_number != '':
~~~~^^^^^^^^
KeyError: 'name'