Last active
August 30, 2023 05:58
-
-
Save sanjarcode/0ecc4fc4cb281e1b061d0edf27c89b4c to your computer and use it in GitHub Desktop.
Download Udemy caption links, other Udemy setup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Local server (assumes Node.js is installed) | |
Purpose - the console code sends data to this (local) server, so | |
the transcript files can be saved locally. | |
Add this file to a folder, then run the following command in the folder | |
`npm init -y && npm install express body-parser cors && node app.js` | |
*/ | |
const express = require("express"); | |
const bodyParser = require("body-parser"); | |
const cors = require("cors"); | |
// Build the Express application and register the request middleware.
const app = express();

// Body parsers: URL-encoded form bodies first, then JSON bodies.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(express.json());

// CORS middleware, so the console script described in the file header can call this server.
app.use(cors());

// Readiness probe: answers GET / with a fixed JSON string.
app.get("/", (_req, res) => {
  res.json("Save all server ready");
});
/**
 * POST /save-all
 *
 * Expects a JSON body of the form `{ name, value }` and writes `value` to
 * `./vtt/<name>.vtt`.
 *
 * Responses:
 *   200 "Ok"      - the file was written
 *   400 "Not Ok"  - the body is missing `name` or a string `value`
 *   (errors)      - file-system failures are forwarded to Express via next(err)
 */
app.post("/save-all", (req, res, next) => {
  const fs = require("fs");
  const path = require("path");
  const dir = "./vtt";

  const { name, value } = req.body ?? {};
  // Reject incomplete payloads up front instead of writing "undefined.vtt"
  // (or crashing inside fs.writeFile on a non-string value).
  if (name == null || name === "" || typeof value !== "string") {
    res.status(400).send("Not Ok");
    return;
  }

  // `name` comes from the network: keep only its last path segment so a value
  // such as "../../etc/x" cannot escape the target directory.
  const safeName = path.basename(String(name));
  console.log(`Saving ${safeName}.vtt (${value.length} characters)`);

  // Create the target directory on demand; the write used to fail when it was missing.
  fs.mkdir(dir, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      // Throwing inside an async callback would take the whole server down.
      next(mkdirErr);
      return;
    }
    fs.writeFile(path.join(dir, `${safeName}.vtt`), value, (writeErr) => {
      if (writeErr) {
        next(writeErr);
        return;
      }
      console.log("Saved!");
      res.send("Ok");
    });
  });
});
// Catch-all for requests that matched no route above.
// This app configures no view engine, so the previous res.render("404", ...)
// could not produce a page; answer with JSON like the other routes do.
// The response ends here, so next() is intentionally not called.
app.use((req, res) => {
  res.status(404).json({ error: "Page Not Found", path: req.originalUrl });
});
app.listen(3000, () => console.log("Save-all server running on port 3000")); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// get caption URL (s) | |
/**
 * Fetches one lecture from the Udemy API and returns the caption (.vtt) URL
 * for the requested language.
 *
 * Meant to run in the browser console on a logged-in udemy.com tab: the
 * request is sent with `credentials: "include"`.
 *
 * @param {string} courseNumber  Course id used in the API path.
 * @param {string} lectureNumber Lecture id used in the API path.
 * @param {string} languageCode  Caption locale, matched case-insensitively against `locale_id`.
 * @param {string} url           Base URL of the subscribed-courses endpoint.
 * @param {string} queryParams   Query string selecting the returned fields.
 * @returns {Promise<string|undefined>} The caption URL, or `undefined` when
 *   the lecture has no caption in that language (or no captions at all).
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function getOneCaptionLink(
  courseNumber = "1879018",
  lectureNumber = "11738946",
  languageCode = "en_US",
  url = "https://www.udemy.com/api-2.0/users/me/subscribed-courses",
  queryParams = "fields[lecture]=asset,description,download_url,is_free,last_watched_second&fields[asset]=asset_type,length,media_license_token,course_is_drmed,media_sources,captions,thumbnail_sprite,slides,slide_urls,download_urls,external_url"
) {
  // Default parameters only cover `undefined`; these fallbacks also cover
  // `null` and "" passed explicitly.
  url ||= "https://www.udemy.com/api-2.0/users/me/subscribed-courses";
  queryParams ||=
    "fields[lecture]=asset,description,download_url,is_free,last_watched_second&fields[asset]=asset_type,length,media_license_token,course_is_drmed,media_sources,captions,thumbnail_sprite,slides,slide_urls,download_urls,external_url";

  const response = await fetch(
    `${url}/${courseNumber}/lectures/${lectureNumber}?${queryParams}`,
    {
      headers: {
        accept: "application/json, text/plain, */*",
        "accept-language": "en-GB",
        "cache-control": "no-cache",
        pragma: "no-cache",
        "sec-ch-ua":
          '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "x-requested-with": "XMLHttpRequest",
        "x-udemy-cache-brand": "INen_US",
        "x-udemy-cache-campaign-code": "ST6OT32123",
        "x-udemy-cache-device": "None",
        "x-udemy-cache-language": "en",
        "x-udemy-cache-logged-in": "1",
        "x-udemy-cache-marketplace-country": "IN",
        "x-udemy-cache-price-country": "IN",
        "x-udemy-cache-release": "f3e5ec9c20da099eb2d2",
        "x-udemy-cache-user": "56648366",
        "x-udemy-cache-version": "1",
      },
      // NOTE(review): the course slug in the referrer is hard-coded; confirm
      // whether it needs to follow `courseNumber`.
      referrer: `https://www.udemy.com/course/nodejs-the-complete-guide/learn/lecture/${lectureNumber}`,
      referrerPolicy: "strict-origin-when-cross-origin",
      body: null,
      method: "GET",
      mode: "cors",
      credentials: "include",
    }
  );

  // Fail with a readable message instead of a TypeError further down.
  if (!response.ok) {
    throw new Error(
      `Lecture request failed with status ${response.status} (course ${courseNumber}, lecture ${lectureNumber})`
    );
  }

  const ASSET_KEY = "asset";
  const CAPTIONS_KEY = "captions";
  const LOCALE_KEY = "locale_id";
  const CAPTION_URL_KEY = "url";

  const data = await response.json();
  /*
  Example payload:
  {
    _class: "lecture",
    id: 11738946,
    description: "",
    is_free: false,
    asset: {},
    url: "/course/nodejs-the-complete-guide/learn/lecture/11738946",
    last_watched_second: 15,
    download_url: "/api-2.0/users/me/subscribed-courses/1879018/lectures/11738946/"
  }
  */

  // `asset` may come back without a `captions` list (see the example above);
  // treat that as "no captions" rather than throwing.
  const captionsForAllLangs = data?.[ASSET_KEY]?.[CAPTIONS_KEY] ?? [];
  const wantedLocale = String(languageCode).toLowerCase();
  const match = captionsForAllLangs.find(
    (item) => item?.[LOCALE_KEY]?.toLowerCase() === wantedLocale
  );
  return match?.[CAPTION_URL_KEY];
}
/**
 * Collects the caption (.vtt) URLs of every lecture in a course.
 *
 * Two requests are made against the subscriber-curriculum-items endpoint:
 * the first, with page_size=1, only reads the total item count; the second
 * asks for that many items in one page.
 *
 * @param {string} courseNumber Course id used in the API path.
 * @param {string} languageCode Caption locale, matched case-insensitively
 *   against `locale_id` (same rule as getOneCaptionLink).
 * @param {number} page         Page number sent with both requests.
 * @returns {Promise<string[]>} Caption URLs of lectures that have a caption in
 *   the requested language; lectures without one are omitted.
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function getAllCaptionsLinks(
  courseNumber = "1879018",
  languageCode = "en_US",
  page = 1
) {
  const VIDEO_COUNT_KEY = "count";

  // Single place for the request that the original issued twice verbatim.
  const fetchCurriculumPage = async (pageSize) => {
    const url = `https://www.udemy.com/api-2.0/courses/${courseNumber}/subscriber-curriculum-items/?page=${page}&page_size=${pageSize}&fields[lecture]=title,sort_order,asset&fields[asset]=title,filename,asset_type,time_estimation,captions`;
    const response = await fetch(url, {
      headers: {
        accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        pragma: "no-cache",
        "sec-ch-ua":
          '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
      },
      referrerPolicy: "strict-origin-when-cross-origin",
      body: null,
      method: "GET",
      mode: "cors",
      credentials: "include",
    });
    if (!response.ok) {
      throw new Error(
        `Curriculum request failed with status ${response.status} (course ${courseNumber})`
      );
    }
    return response.json();
  };

  // First request: only used to learn how many items the course has.
  const probe = await fetchCurriculumPage(1);
  const totalItems = probe[VIDEO_COUNT_KEY];
  if (!totalItems) {
    // Nothing to list; avoids a second request with page_size=0/undefined.
    return [];
  }

  // Second request: everything in one page.
  const data = await fetchCurriculumPage(totalItems);
  const wantedLocale = String(languageCode).toLowerCase();

  return (data.results ?? [])
    .map((item) =>
      item._class === "lecture"
        ? item?.asset?.captions?.find(
            (captionItem) =>
              captionItem.locale_id?.toLowerCase() === wantedLocale
          )?.url
        : null
    )
    .filter(Boolean);
}
// handle | |
/**
 * Triggers a browser download of the given caption URL by clicking a
 * temporary <a download> element.
 *
 * @param {string} captionUrl URL of the caption file.
 * @returns {Promise<void>}
 */
async function openTranscriptFromUrl(captionUrl) {
  const link = document.createElement("a");
  link.href = captionUrl;
  // Suggested file name: the current local time with every whitespace run
  // replaced by "-" (the previous replace(" ", "-") only touched the first space).
  link.download = new Date().toLocaleTimeString().replace(/\s+/g, "-");

  // Attach the link before clicking so it can be removed afterwards; calling
  // document.removeChild() on a node that was never attached throws.
  document.body.appendChild(link);
  link.click();
  link.remove();
}
/**
 * Copies `text` to the clipboard through navigator.clipboard; when that
 * fails, the failure is logged and the text is printed to the console instead.
 *
 * @param {string} text   Text to copy.
 * @param {boolean} open  Not read by this function.
 * @returns {Promise<void>}
 */
async function copyToClipboard(text, open = false) {
  // Wrapped in an async function so that a synchronous failure (for example a
  // missing clipboard object) ends up in the same rejection handler.
  const writeAndReport = async () => {
    await navigator.clipboard.writeText(text);
    console.log("Async: Copying to clipboard was successful!");
  };

  return writeAndReport().catch((reason) => {
    console.error("Async: Could not copy text: ", reason);
    console.log("Printing instead");
    console.log(text);
  });
}
// Demo 1 - single caption: asks for a course and lecture number, then copies
// the caption URL to the clipboard and starts a download of it.
(async () => {
  const url = await getOneCaptionLink(
    prompt("Course number"),
    prompt("Lecture number")
  ); // put the numbers here
  if (!url) {
    // getOneCaptionLink yields undefined when no caption matches; do not pass
    // that on as if it were a URL.
    console.warn("No caption URL found for that course/lecture");
    return;
  }
  await copyToClipboard(url); // OR
  await openTranscriptFromUrl(url);
})().catch((err) => console.error(err)); // report failures instead of leaving the promise unhandled

// Demo 2 - multiple captions: logs every caption URL of the default course.
(async () => {
  const urls = await getAllCaptionsLinks();
  console.log(urls);
})().catch((err) => console.error(err));
// downloaded to local server | |
{ | |
// import myJson from "./nodejs-complete-guide.json" assert { type: "json" }; | |
// const pages = myJson.pages; | |
// console.log( | |
// pages | |
// .map((item) => item.results) | |
// .flat(Infinity) | |
// .filter((item) => item.asset.asset_type === "Video") | |
// .map((item) => item.id) | |
// ); | |
// import myJson from "./nodejs-complete-guide.json" assert { type: "json" }; | |
// const { lectureIds, courseNumber } = myJson; | |
/**
 * Loads the course description JSON served at
 * http://127.0.0.1:5500/nodejs-complete-guide.json.
 *
 * @returns {Promise<any>} The parsed JSON document.
 */
async function getJSON() {
  return fetch("http://127.0.0.1:5500/nodejs-complete-guide.json").then(
    (response) => response.json()
  );
}
/**
 * Downloads the resource at `url` and returns its body as text.
 *
 * @param {string} url Location of the text (.vtt) file.
 * @returns {Promise<string>} The response body.
 */
async function getVTTAsText(url) {
  return fetch(url).then((response) => response.text());
}
/**
 * POSTs `data` as JSON to the local save server and logs its reply.
 *
 * The /save-all route answers with the plain text "Ok", which is not valid
 * JSON, so the reply is read as text and parsed as JSON only when possible.
 * Failures are logged, not thrown, so one failed save does not reject a
 * surrounding Promise.all.
 *
 * @param {object} data Payload to send (for /save-all: `{ name, value }`).
 * @param {string} url  Endpoint to POST to.
 * @returns {Promise<void>}
 */
async function saveLocally(data, url = "http://localhost:3000") {
  try {
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify(data),
    });

    const raw = await response.text();
    if (!response.ok) {
      throw new Error(`Save failed with status ${response.status}: ${raw}`);
    }

    let reply = raw;
    try {
      reply = JSON.parse(raw);
    } catch {
      // Plain-text reply (e.g. "Ok"): log it as received.
    }
    console.log(reply);
  } catch (error) {
    console.error(error);
  }
}
/**
 * Downloads the transcripts of the first `LIMIT` lectures listed in the course
 * JSON and sends each one to the local save server.
 *
 * Steps: load `{ lectureIds, courseNumber }`, resolve one caption URL per
 * lecture, download each caption as text, then POST every transcript whose
 * text starts with "WEB" to /save-all under the lecture id as file name.
 *
 * @param {number} LIMIT Number of lectures (from the start of the list) to process.
 * @returns {Promise<void>} Resolves once every save request has finished.
 */
async function main(LIMIT = 5) {
  const { lectureIds, courseNumber } = await getJSON();
  // Computed once; the original applied the same filter twice.
  const selectedIds = lectureIds.filter((_, i) => i < LIMIT);

  const urls = await Promise.all(
    selectedIds.map((id) => getOneCaptionLink(courseNumber, id))
  );
  // Lectures without a caption URL get an empty transcript.
  const tss = await Promise.all(
    urls.map((url) => (url ? getVTTAsText(url) : ""))
  );

  const sumLength = tss.reduce((accum, ts) => accum + (ts?.length ?? 0), 0);
  console.log(urls, sumLength, tss);

  // Awaited so that main() only resolves after the saves are done; this
  // promise was previously left floating.
  await Promise.all(
    selectedIds
      .map((id, idx) => ({ name: id, value: tss[idx] }))
      // Skips the empty placeholders and any text not starting with "WEB".
      .filter(({ value }) => value.startsWith("WEB"))
      .map(({ name, value }) =>
        saveLocally({ name, value }, "http://localhost:3000/save-all")
      )
  );
}
// main(10); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// meant to run in the terminal | |
// Converts a given VTT file into plain text (with paragraphs) | |
// respects time between words, and adds spacing | |
const fs = require("fs"); | |
const WebVTT = require("node-webvtt"); | |
const wrap = require("fast-word-wrap"); | |
/**
 * Concatenates the `text` of every caption, each followed by a single space.
 *
 * @param {Iterable<{text: string}>} captions Captions to read the text from.
 * @returns {string} The joined text ("" for no captions); note the trailing space.
 */
function extractTextWithoutTimestamps(captions) {
  return [...captions].map((cue) => cue.text + " ").join("");
}
/**
 * Splits `text` into paragraphs, one per sentence.
 *
 * The text is cut after every ".", "!" or "?" that is followed by spaces; each
 * piece ending in one of those characters becomes a paragraph (with a trailing
 * space). Pieces without terminal punctuation are carried into the next
 * paragraph, and whatever is left at the end is emitted as a final paragraph
 * instead of being dropped.
 *
 * @param {string} text       Plain text to split.
 * @param {number} threshold  Currently not used by the grouping; kept so
 *   existing callers keep working.
 * @returns {string[]} The paragraphs, in order.
 */
function groupSentencesIntoParagraphs(text, threshold = 3) {
  const paragraphs = [];
  let currentParagraph = "";
  const sentences = text.split(/(?<=[.!?]) +/);
  for (const sentence of sentences) {
    currentParagraph += sentence + " ";
    if (/[.!?]$/.test(sentence)) {
      paragraphs.push(currentParagraph);
      currentParagraph = "";
    }
  }
  // Text after the last sentence terminator used to be lost; keep it unless it
  // is only whitespace.
  if (currentParagraph.trim() !== "") {
    paragraphs.push(currentParagraph);
  }
  return paragraphs;
}
/**
 * Reads a VTT file synchronously and turns its cues into text paragraphs.
 *
 * @param {string} filePath Path of the .vtt file, read as UTF-8.
 * @returns {string[]} Paragraphs as produced by groupSentencesIntoParagraphs.
 */
function processVTTFile(filePath) {
  const { cues } = WebVTT.parse(fs.readFileSync(filePath, "utf-8"));
  return groupSentencesIntoParagraphs(extractTextWithoutTimestamps(cues));
}
// trial - | |
// const vttFilePath = "your_subtitle_file.vtt"; | |
// const paragraphs = processVTTFile(vttFilePath); | |
// paragraphs.forEach((paragraph, idx) => { | |
// console.log(`Paragraph ${idx + 1}:\n${paragraph}\n`); | |
// }); | |
// place VTT files in current directory, and run this function | |
// .txt files will be generated | |
/**
 * Converts every .vtt file in `dir` into a word-wrapped .txt file.
 *
 * For each file (extension matched case-insensitively) the paragraphs from
 * processVTTFile are wrapped to 80 characters per line with `wrap` and
 * written to `<dest>/<name>.txt`.
 *
 * @param {string} dir  Directory to scan for .vtt files.
 * @param {string} dest Output directory; when empty, `dir` is used.
 * @returns {void} Work happens asynchronously; errors are thrown from the callbacks.
 */
function createPlainTextFromVTT(dir = "./", dest = "") {
  // `??=` never fired for the "" default, so files were written to "/<name>";
  // `||=` also treats the empty string as "not given".
  dest ||= dir;
  const cpl = 80; // characters per line

  fs.readdir(dir, (err, files) => {
    if (err) throw err;

    files
      .filter((file) => file.toLowerCase().endsWith(".vtt"))
      .forEach((file) => {
        const paragraphs = processVTTFile(`${dir}/${file}`);
        const fileContent = paragraphs.join("\n" + "\n");

        // Wrap each non-empty line separately. Declared locally: these used to
        // be implicit globals.
        const partials = fileContent
          .split("\n")
          .filter((s) => s)
          .map((p) => wrap(p, cpl));
        const formattedFileContent = partials.join("\n");

        // Swap only the trailing extension, ignoring case, so "X.VTT" cannot
        // overwrite itself and a ".vtt" inside a directory name is untouched.
        const target = `${dest}/${file.replace(/\.vtt$/i, ".txt")}`;
        fs.writeFile(target, formattedFileContent, (writeErr) => {
          if (writeErr) throw writeErr;
        });
      });
  });
}
// createPlainTextFromVTT(); | |
module.exports = { processVTTFile, createPlainTextFromVTT }; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs"); | |
const myJSON = require("./nodejs-complete-guide-all.json"); | |
const { lectureIds, sectionTitles, pages } = myJSON; | |
// Flatten every page's results into one list, keeping only lectures whose
// asset type is "Video" or "Article"; other asset types are dropped here.
const lectureObjects = pages
  .map((item) => item.results)
  .flat(Infinity)
  .filter((item) => ["Video", "Article"].includes(item.asset.asset_type));

// Titles that mark the first lecture of a new section.
const SECTION_START_TITLES = [
  "Module Introduction",
  "Introduction",
  "Course Roundup",
];

// Index (1-based) of the section the current lecture belongs to; advanced
// whenever a section-start title is seen.
let sectionNumber = 0;

// Writes one markdown file per video lecture:
//   ./output/<section folder>/<lecture number>_<title_cleaned>.md
// containing a heading, the description, the transcript from ./txt/<id>.txt
// and a closing rule.
lectureObjects.forEach((item, index) => {
  // Articles keep their position in the numbering but produce no file.
  if (item.asset.asset_type !== "Video") return;

  const { id = "", title = "", title_cleaned = "", description = "" } = item;
  const lectureNumber = index + 1;
  const lectureHead = [
    `## ${lectureNumber}. ${title}`,
    ``,
    `<strong><em>${description || "no description"}</em></strong>`,
    ``,
  ].join("\n");
  const fileName = `${lectureNumber}_${title_cleaned}.md`;
  const lectureFooter = [`---`].join("\n");

  if (SECTION_START_TITLES.includes(title)) {
    sectionNumber++;
  }

  // Lectures seen before the first section start (or beyond the known titles)
  // have no section title; give them a real folder name instead of "undefined".
  const sectionTitle = sectionTitles[sectionNumber - 1] ?? "no_section";
  const sectionFolderName = sectionTitle
    .replaceAll(".", "")
    .replaceAll(":", "")
    .replaceAll("Section ", "")
    .replaceAll(",", "_")
    .replaceAll(" ", "_")
    .replaceAll("__", "_")
    .replaceAll("(", "")
    .replaceAll(")", "");

  fs.readFile(`./txt/${id}.txt`, "utf8", (readErr, text) => {
    // A lecture without a transcript file still gets its heading and footer;
    // any other read failure is reported instead of being ignored.
    if (readErr && readErr.code !== "ENOENT") throw readErr;

    const fileContent = [lectureHead, text ?? "", lectureFooter].join("\n");
    fs.mkdir(`./output/${sectionFolderName}`, { recursive: true }, (mkdirErr) => {
      if (mkdirErr) throw mkdirErr;
      fs.writeFile(
        `./output/${sectionFolderName}/${fileName}`,
        fileContent,
        (writeErr) => {
          if (writeErr) throw writeErr;
        }
      );
    });
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Next step is to process the .vtt files and create plain text out of them. By plain text I mean "creating paragraphs" based on timestamps. See https://pypi.org/project/vttformatter/ — that didn't work. Wrote a naive VTT-to-plain-text paragraph converter with the help of ChatGPT. Done.