Skip to content

Instantly share code, notes, and snippets.

@knzm
Created November 26, 2020 00:08
Show Gist options
  • Save knzm/1cb560f9cbc8ea15a941836d04a4f5d5 to your computer and use it in GitHub Desktop.
Save knzm/1cb560f9cbc8ea15a941836d04a4f5d5 to your computer and use it in GitHub Desktop.
const fetch = require('node-fetch')
const { JSDOM } = require('jsdom')
const nodeEval = require('node-eval')
const xmlParser = require('fast-xml-parser')
// Caption language code used when no language is passed on the command line.
const DEFAULT_LANG = 'ja'
/**
 * Extracts the caption track list from the HTML of a watch page.
 *
 * Inline <script> elements inside <body> whose text starts with `window["`
 * are evaluated against a stub `window` object, and the track list is then
 * read from `window.ytInitialPlayerResponse.captions`.
 *
 * @param {string} body - HTML source of the watch page.
 * @returns {Array<Object>} The `captionTracks` entries, or an empty array when
 *   the page exposes no player response, no captions, or no track list.
 */
function extractTracksFromHTML(body) {
  const dom = new JSDOM(body)
  const context = {
    window: {},
  }

  // SECURITY: the script text evaluated below comes straight from `body`,
  // i.e. from a remote page. NOTE(review): confirm that node-eval provides
  // the isolation you need, or replace this with JSON extraction.
  Array.from(dom.window.document.querySelectorAll('body script'))
    .map(node => node.textContent)
    .filter(text => text.trim().startsWith('window["'))
    .forEach(text => {
      nodeEval(text, 'watch.js', context)
    })

  // Walk the nested structure defensively: videos without captions (or a
  // changed page layout) would otherwise surface as an opaque TypeError.
  const playerResponse = context.window.ytInitialPlayerResponse
  const captions = playerResponse && playerResponse.captions
  const renderer = captions && captions.playerCaptionsTracklistRenderer
  const tracks = renderer && renderer.captionTracks
  return tracks || []
}
/**
 * Picks the caption track to use for a given language.
 *
 * Tracks whose `kind` is 'asr' count as auto-generated; every other track
 * counts as manually authored. A manual track is preferred over a generated
 * one, and when several tracks of the same category match, the last one wins.
 *
 * @param {Array<Object>} tracks - Caption track descriptors.
 * @param {string} lang - Language code to match against `languageCode`.
 * @returns {Object|undefined} The chosen track, or undefined if none matches.
 */
function selectTrack(tracks, lang) {
  let manualTrack
  let generatedTrack

  for (const candidate of tracks) {
    if (candidate.languageCode !== lang) {
      continue
    }
    if (candidate.kind === 'asr') {
      generatedTrack = candidate
    } else {
      manualTrack = candidate
    }
  }

  return manualTrack || generatedTrack
}
/**
 * Parses a transcript XML document and returns its caption texts.
 *
 * Reads the `text` entries under the root `transcript` element.
 *
 * @param {string} body - Transcript XML source.
 * @returns {Array} The parsed `text` entries, always as an array; empty when
 *   the document has no `transcript` element or no `text` entries.
 */
function extractTextsFromXML(body) {
  const data = xmlParser.parse(body)
  const transcript = data && data.transcript
  // An empty <transcript/> parses to a falsy value, so there is nothing to read.
  const texts = transcript ? transcript.text : undefined
  if (texts === undefined || texts === null) {
    return []
  }
  // The parser yields a bare value rather than a one-element array when a tag
  // occurs only once; normalise so callers can always iterate.
  return Array.isArray(texts) ? texts : [texts]
}
/**
 * Command-line entry point.
 *
 * Reads the video id from `process.argv[2]` and an optional language code
 * from `process.argv[3]` (falling back to DEFAULT_LANG), downloads the watch
 * page, selects a caption track for that language, downloads its transcript
 * and prints each caption text on its own line.
 *
 * On a missing video id, a missing caption track, or any fetch/parse failure,
 * a message is written to stderr and `process.exitCode` is set to 1.
 *
 * @returns {Promise<void>} Settles once all work (or error reporting) is done.
 */
function main() {
  const videoId = process.argv[2]
  if (!videoId) {
    console.error('Usage: node <script> <videoId> [lang]')
    process.exitCode = 1
    return Promise.resolve()
  }
  const lang = process.argv[3] || DEFAULT_LANG
  const watchUrl = `https://www.youtube.com/watch?v=${encodeURIComponent(videoId)}`

  // A single flat chain: every promise is returned, so any rejection reaches
  // the final catch instead of becoming an unhandled rejection.
  return fetch(watchUrl)
    .then(res => res.text())
    .then(body => {
      const tracks = extractTracksFromHTML(body)
      const track = selectTrack(tracks, lang)
      if (!track) {
        throw new Error(`No caption track found for language "${lang}"`)
      }
      return fetch(track.baseUrl)
    })
    .then(res => res.text())
    .then(body => {
      const texts = extractTextsFromXML(body)
      texts.forEach(text => console.log(text))
    })
    .catch(err => {
      console.error(`Failed to fetch captions: ${err && err.message ? err.message : err}`)
      process.exitCode = 1
    })
}
// Run main() only when this file is executed directly, not when it is loaded via require().
if (require.main === module) {
main()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment