Last active
May 8, 2024 08:22
-
-
Save hippietrail/af26d363d7f6ac37565c5074f04e4f5e to your computer and use it in GitHub Desktop.
TypeScript code to fetch one or more YouTube transcripts as plain text without an API key
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import url from 'url';
import parse from 'html-dom-parser';
import { Element, Text } from 'domhandler';
import { decodeXML } from 'entities';
/**
 * Downloads the YouTube watch page for a single video.
 *
 * The HTTP status is not inspected: whatever body the server sends is returned.
 *
 * @param videoID - Value placed in the `v` query parameter of the watch URL.
 * @returns The response body decoded as text.
 * @throws Whatever `fetch` or `Response.text()` rejects with.
 */
async function getHtmlByVideoID(videoID: string): Promise<string> {
  // url.format prepends the missing leading slash to `pathname` and
  // percent-encodes the query value.
  const watchUrl = url.format({
    protocol: 'https',
    hostname: 'www.youtube.com',
    pathname: 'watch',
    query: { v: videoID },
  });
  const response = await fetch(watchUrl);
  return response.text();
}
/** Text that immediately precedes the player-response JSON in the watch page. */
const PLAYER_RESPONSE_MARKER = 'var ytInitialPlayerResponse = ';

/** Number of download rounds main() performs before giving up on IDs that keep failing. */
const MAX_FETCH_ROUNDS = 3;

/** The part of the player response this script reads; every level may be absent. */
interface PlayerResponse {
  captions?: {
    playerCaptionsTracklistRenderer?: {
      captionTracks?: { baseUrl?: unknown }[];
    };
  };
}

/**
 * Returns the balanced `{...}` object literal that starts at `text[start]`.
 *
 * Braces inside double-quoted strings (including after backslash escapes) are
 * ignored, so trailing statements after the object do not matter.
 *
 * @returns The object text, or `undefined` if `text[start]` is not `{` or the
 *          object is never closed.
 */
function sliceBalancedObject(text: string, start: number): string | undefined {
  if (text[start] !== '{') return undefined;
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === '{') depth++;
    else if (ch === '}' && --depth === 0) return text.slice(start, i + 1);
  }
  return undefined;
}

/**
 * Locates and parses the player-response object embedded in a watch page.
 *
 * @returns The parsed object, or `undefined` when the marker is missing or the
 *          text after it is not a parseable JSON object.
 */
function parsePlayerResponse(html: string): PlayerResponse | undefined {
  const markerAt = html.indexOf(PLAYER_RESPONSE_MARKER);
  if (markerAt === -1) return undefined;
  const jsonText = sliceBalancedObject(html, markerAt + PLAYER_RESPONSE_MARKER.length);
  if (jsonText === undefined) return undefined;
  try {
    const parsed: unknown = JSON.parse(jsonText);
    if (typeof parsed !== 'object' || parsed === null) return undefined;
    // Every field of PlayerResponse is optional and re-checked at the point of use.
    return parsed as PlayerResponse;
  } catch {
    return undefined;
  }
}

/**
 * Prints the transcript of one video, one caption per line, to stdout.
 *
 * Uses the first caption track listed in the page's player response. Problems
 * with the page content are reported on stderr and the video is skipped.
 *
 * @param videoID - Used only for diagnostics.
 * @param html - Body of the video's watch page.
 * @throws Whatever fetching or parsing the caption XML throws.
 */
async function printTranscript(videoID: string, html: string): Promise<void> {
  if (!html) {
    console.error('empty response', videoID);
    return;
  }
  const playerResponse = parsePlayerResponse(html);
  if (playerResponse === undefined) {
    console.error('no player response', videoID);
    return;
  }
  const trackUrl =
    playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks?.[0]?.baseUrl;
  if (typeof trackUrl !== 'string') {
    console.error('no captions', videoID);
    return;
  }

  const xml = await (await fetch(trackUrl)).text();
  // The root is the first top-level node that has children; anything before it
  // (such as an XML prolog) has none.
  const root = parse(xml).find((node): node is Element => 'children' in node);
  if (root === undefined) {
    console.error('no transcript', videoID);
    return;
  }
  for (const cue of root.children) {
    if (!('children' in cue)) continue;
    const first = cue.children[0];
    // A cue element with no child node carries no text; skip it.
    if (first !== undefined && 'data' in first) console.log(decodeXML(first.data));
  }
}

/**
 * Entry point: treats every command-line argument as a video ID, downloads the
 * watch pages concurrently and prints each transcript in argument order.
 *
 * IDs whose page download rejects are retried, for at most MAX_FETCH_ROUNDS
 * rounds in total. A failure while processing one video is logged and does not
 * stop the remaining videos.
 */
async function main(): Promise<void> {
  let videoIDs = process.argv.slice(2);
  for (let round = 1; videoIDs.length > 0 && round <= MAX_FETCH_ROUNDS; round++) {
    const settledPromises = await Promise.allSettled(
      videoIDs.map((videoID) => getHtmlByVideoID(videoID)),
    );
    const videoIDsToRetry: string[] = [];
    for (const [index, videoID] of videoIDs.entries()) {
      const settled = settledPromises[index];
      // allSettled yields exactly one result per input, so this never triggers.
      if (settled === undefined) continue;
      if (settled.status === 'rejected') {
        console.error(`rejected ${videoID}`, settled.reason);
        videoIDsToRetry.push(videoID);
        continue;
      }
      try {
        await printTranscript(videoID, settled.value);
      } catch (error: unknown) {
        console.error(`failed ${videoID}`, error);
      }
    }
    videoIDs = videoIDsToRetry;
  }
  if (videoIDs.length > 0) {
    console.error('giving up on', videoIDs.join(' '));
  }
}
// Run the script; report any failure from main() explicitly and exit with a
// failing status instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment