Skip to content

Instantly share code, notes, and snippets.

@sanjarcode
Last active August 30, 2023 05:58
Show Gist options
  • Save sanjarcode/0ecc4fc4cb281e1b061d0edf27c89b4c to your computer and use it in GitHub Desktop.
Save sanjarcode/0ecc4fc4cb281e1b061d0edf27c89b4c to your computer and use it in GitHub Desktop.
Download Udemy caption links, other Udemy setup
/*
Local server (assumes Node.js is installed)
Purpose - the console code sends data to this (local) server, so
the transcript files can be saved locally.
Add this file to a folder, then run the following command in the folder
`npm init -y && npm install express body-parser cors && node app.js`
*/
const express = require("express");
const bodyParser = require("body-parser");
const cors = require("cors");
// Express application that receives transcript data posted by the browser console code.
const app = express();

// Request-body parsers: URL-encoded forms (simple querystring parser) and JSON.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(express.json());

// CORS middleware with its default options.
app.use(cors());

// Root route: replies with a fixed JSON string so a client can check the server is up.
app.get("/", (_req, res) => {
  res.json("Save all server ready");
});
/**
 * POST /save-all — stores one transcript as ./vtt/<name>.vtt.
 *
 * Expects a JSON body `{ name, value }` where `value` is the transcript text.
 * Replies "Ok" on success and "Not Ok" (HTTP 400) when the body is unusable.
 * File-system failures are passed to Express via `next(err)` instead of being
 * thrown inside a callback, which would take the whole server down.
 */
app.post("/save-all", (req, res, next) => {
  const fs = require("fs");
  const path = require("path");
  const dir = "./vtt";

  const { name, value } = req.body ?? {};
  console.log(name, typeof value === "string" ? value.length : value);

  // express.json() leaves an empty object when no body is sent, so check the
  // fields themselves rather than the truthiness of req.body.
  if (name == null || typeof value !== "string") {
    res.status(400).send("Not Ok");
    return;
  }

  // Keep only the final path segment so a crafted name cannot escape `dir`.
  const safeName = path.basename(String(name));
  if (safeName === "" || safeName === "." || safeName === "..") {
    res.status(400).send("Not Ok");
    return;
  }

  // Create the target folder on demand; the original assumed it already existed.
  fs.mkdir(dir, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      next(mkdirErr);
      return;
    }
    fs.writeFile(path.join(dir, `${safeName}.vtt`), value, (writeErr) => {
      if (writeErr) {
        next(writeErr);
        return;
      }
      console.log("Saved!");
      res.send("Ok");
    });
  });
});
// Fallback for every request that no route above handled.
// This file never configures a view engine or a "404" view, so the response is
// sent as plain text instead of being rendered. The chain also ends here:
// calling next() after the response has been sent is not needed.
app.use((req, res) => {
  res.status(404).send("Page Not Found");
});

// The browser-side helpers post to http://localhost:3000, so the port stays fixed.
app.listen(3000, () => console.log("Save-all server running on port 3000"));
// get caption URL (s)
/**
 * Fetches a single lecture from Udemy's subscribed-courses API and returns the
 * caption (VTT) URL for the requested language.
 *
 * Meant to be run as browser console code on udemy.com: the request is sent
 * with `credentials: "include"`, so it relies on the page's session cookies.
 *
 * @param {string} courseNumber - Numeric course id.
 * @param {string} lectureNumber - Numeric lecture id.
 * @param {string} languageCode - Locale to look for; compared case-insensitively
 *   with each caption's `locale_id`.
 * @param {string} url - API base URL; any falsy value falls back to the default.
 * @param {string} queryParams - Query string selecting the returned fields;
 *   any falsy value falls back to the default.
 * @returns {Promise<string|undefined>} The caption URL, or `undefined` when the
 *   lecture has no captions or none in the requested language.
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function getOneCaptionLink(
  courseNumber = "1879018",
  lectureNumber = "11738946",
  languageCode = "en_US",
  url = "https://www.udemy.com/api-2.0/users/me/subscribed-courses",
  queryParams = "fields[lecture]=asset,description,download_url,is_free,last_watched_second&fields[asset]=asset_type,length,media_license_token,course_is_drmed,media_sources,captions,thumbnail_sprite,slides,slide_urls,download_urls,external_url"
) {
  // Parameter defaults only cover `undefined`; these also cover null / "".
  url ||= "https://www.udemy.com/api-2.0/users/me/subscribed-courses";
  queryParams ||=
    "fields[lecture]=asset,description,download_url,is_free,last_watched_second&fields[asset]=asset_type,length,media_license_token,course_is_drmed,media_sources,captions,thumbnail_sprite,slides,slide_urls,download_urls,external_url";

  const response = await fetch(
    `${url}/${courseNumber}/lectures/${lectureNumber}?${queryParams}`,
    {
      headers: {
        accept: "application/json, text/plain, */*",
        "accept-language": "en-GB",
        "cache-control": "no-cache",
        pragma: "no-cache",
        "sec-ch-ua":
          '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "x-requested-with": "XMLHttpRequest",
        "x-udemy-cache-brand": "INen_US",
        "x-udemy-cache-campaign-code": "ST6OT32123",
        "x-udemy-cache-device": "None",
        "x-udemy-cache-language": "en",
        "x-udemy-cache-logged-in": "1",
        "x-udemy-cache-marketplace-country": "IN",
        "x-udemy-cache-price-country": "IN",
        "x-udemy-cache-release": "f3e5ec9c20da099eb2d2",
        "x-udemy-cache-user": "56648366",
        "x-udemy-cache-version": "1",
      },
      referrer: `https://www.udemy.com/course/nodejs-the-complete-guide/learn/lecture/${lectureNumber}`,
      referrerPolicy: "strict-origin-when-cross-origin",
      body: null,
      method: "GET",
      mode: "cors",
      credentials: "include",
    }
  );

  // Fail with a readable message instead of a TypeError further down.
  if (!response.ok) {
    throw new Error(
      `Udemy lecture request failed for lecture ${lectureNumber}: HTTP ${response.status}`
    );
  }

  const ASSET_KEY = "asset";
  const CAPTIONS_KEY = "captions";
  const LOCALE_KEY = "locale_id";
  const CAPTION_URL_KEY = "url";
  const data = await response.json();
  /*
  Example response shape:
  {
    _class: "lecture",
    id: 11738946,
    description: "",
    is_free: false,
    asset: {},
    url: "/course/nodejs-the-complete-guide/learn/lecture/11738946",
    last_watched_second: 15,
    download_url: "/api-2.0/users/me/subscribed-courses/1879018/lectures/11738946/"
  }
  */

  // Lectures without an asset or without a captions list have no caption URL.
  const captionsForAllLangs = data?.[ASSET_KEY]?.[CAPTIONS_KEY];
  if (!Array.isArray(captionsForAllLangs)) {
    return undefined;
  }

  const wantedLocale = languageCode.toLowerCase();
  const match = captionsForAllLangs.find(
    (item) => item?.[LOCALE_KEY]?.toLowerCase() === wantedLocale
  );
  return match?.[CAPTION_URL_KEY];
}
/**
 * Collects the caption (VTT) URLs of every lecture in a course for one language.
 *
 * Works in two requests against the subscriber-curriculum-items endpoint: the
 * first asks for a single item only to read the response's `count` field, the
 * second asks for `count` items at once and extracts the caption URLs.
 * NOTE(review): assumes the API honours a page_size equal to the full item
 * count — confirm for very large courses.
 *
 * @param {string} courseNumber - Numeric course id.
 * @param {string} languageCode - Locale to look for; compared case-insensitively
 *   with each caption's `locale_id` (same rule as getOneCaptionLink).
 * @param {number} page - Page number sent with both requests.
 * @returns {Promise<string[]>} Caption URLs of the lectures that have one.
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function getAllCaptionsLinks(
  courseNumber = "1879018",
  languageCode = "en_US",
  page = 1
) {
  const VIDEO_COUNT_KEY = "count";

  // Both requests share the same options; only page_size differs.
  const requestOptions = {
    headers: {
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
      "accept-language": "en-US,en;q=0.9",
      "cache-control": "no-cache",
      pragma: "no-cache",
      "sec-ch-ua":
        '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": '"Linux"',
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
    },
    referrerPolicy: "strict-origin-when-cross-origin",
    body: null,
    method: "GET",
    mode: "cors",
    credentials: "include",
  };

  // Fetches one page of curriculum items with the given page size.
  const fetchCurriculumPage = async (pageSize) => {
    const url = `https://www.udemy.com/api-2.0/courses/${courseNumber}/subscriber-curriculum-items/?page=${page}&page_size=${pageSize}&fields[lecture]=title,sort_order,asset&fields[asset]=title,filename,asset_type,time_estimation,captions`;
    const response = await fetch(url, requestOptions);
    if (!response.ok) {
      throw new Error(
        `Udemy curriculum request failed for course ${courseNumber}: HTTP ${response.status}`
      );
    }
    return response.json();
  };

  // First request: only used to learn how many items exist.
  const probe = await fetchCurriculumPage(1);
  const totalCount = probe?.[VIDEO_COUNT_KEY];
  if (!totalCount) {
    return [];
  }

  // Second request: everything in one page.
  const data = await fetchCurriculumPage(totalCount);
  const wantedLocale = languageCode.toLowerCase();

  return (data?.results ?? [])
    .filter((item) => item?._class === "lecture")
    .map(
      (item) =>
        item?.asset?.captions?.find(
          (captionItem) => captionItem?.locale_id?.toLowerCase() === wantedLocale
        )?.url
    )
    .filter(Boolean);
}
// handle
/**
 * Triggers a browser download of the caption file behind `captionUrl` by
 * clicking a temporary <a download> element.
 *
 * The suggested file name is the current local time with whitespace replaced
 * by dashes.
 *
 * @param {string} captionUrl - URL of the caption (VTT) file.
 * @returns {Promise<void>}
 */
async function openTranscriptFromUrl(captionUrl) {
  const link = document.createElement("a");
  link.href = captionUrl;
  // Replace every whitespace run, not only the first plain space.
  link.download = new Date().toLocaleTimeString().replace(/\s+/g, "-");

  // The link has to be attached before it can be detached again; the original
  // called document.removeChild() on a node that was never in the document,
  // which always throws.
  document.body.appendChild(link);
  link.click();
  link.remove();
}
/**
 * Writes `text` to the system clipboard; when that fails, logs the error and
 * prints the text to the console instead.
 *
 * @param {string} text - Text to copy.
 * @param {boolean} open - Accepted but not used by this function.
 * @returns {Promise<void>}
 */
async function copyToClipboard(text, open = false) {
  try {
    await navigator.clipboard.writeText(text);
    console.log("Async: Copying to clipboard was successful!");
    return;
  } catch (copyError) {
    console.error("Async: Could not copy text: ", copyError);
  }

  // Only reached when the clipboard write failed.
  console.log("Printing instead");
  console.log(text);
}
// Demo 1 — single caption: asks for the ids, then copies the caption URL to
// the clipboard and starts a download of it.
(async () => {
  const url = await getOneCaptionLink(
    prompt("Course number"),
    prompt("Lecture number")
  ); // put the numbers here
  if (!url) {
    // Nothing to copy or download; avoids putting "undefined" on the clipboard.
    console.warn("No caption URL found for this lecture/language");
    return;
  }
  await copyToClipboard(url); // OR
  await openTranscriptFromUrl(url);
})().catch((error) => console.error(error)); // report failures instead of leaving an unhandled rejection

// Demo 2 — multiple captions: logs every caption URL of the default course.
(async () => {
  const urls = await getAllCaptionsLinks();
  console.log(urls);
})().catch((error) => console.error(error));
// downloaded to local server
{
// import myJson from "./nodejs-complete-guide.json" assert { type: "json" };
// const pages = myJson.pages;
// console.log(
// pages
// .map((item) => item.results)
// .flat(Infinity)
// .filter((item) => item.asset.asset_type === "Video")
// .map((item) => item.id)
// );
// import myJson from "./nodejs-complete-guide.json" assert { type: "json" };
// const { lectureIds, courseNumber } = myJson;
/**
 * Loads the course description JSON from the local static file server and
 * returns the parsed result.
 *
 * @returns {Promise<any>} Parsed content of nodejs-complete-guide.json.
 */
async function getJSON() {
  const response = await fetch(
    "http://127.0.0.1:5500/nodejs-complete-guide.json"
  );
  return response.json();
}
/**
 * Downloads the resource at `url` and returns its body as text.
 *
 * @param {string} url - Location of the VTT file.
 * @returns {Promise<string>} The response body.
 */
async function getVTTAsText(url) {
  const vttResponse = await fetch(url); // fetch text file
  return vttResponse.text();
}
/**
 * POSTs `data` as JSON to the local save server and logs the server's reply.
 *
 * The reply is read as text first and parsed as JSON only when possible: the
 * /save-all route answers with the plain string "Ok", which `response.json()`
 * cannot parse, so the original always ended up in its error branch.
 *
 * Never rejects; network and HTTP failures are reported with console.error.
 *
 * @param {object} data - Payload to send (e.g. `{ name, value }`).
 * @param {string} url - Endpoint to post to.
 * @returns {Promise<void>}
 */
async function saveLocally(data, url = "http://localhost:3000") {
  try {
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify(data),
    });

    const bodyText = await response.text();
    let payload;
    try {
      payload = JSON.parse(bodyText);
    } catch {
      // Not JSON (e.g. "Ok") — keep the raw text.
      payload = bodyText;
    }

    if (!response.ok) {
      console.error(`Save request failed: HTTP ${response.status}`, payload);
      return;
    }
    console.log(payload);
  } catch (error) {
    console.error(error);
  }
}
/**
 * Downloads the transcripts of the first `LIMIT` lectures listed in the local
 * course JSON and posts each one to the local save server.
 *
 * Steps: resolve each lecture's caption URL, download the VTT text, then send
 * every transcript whose text starts with "WEB" to /save-all under the
 * lecture id as its name.
 *
 * @param {number} LIMIT - Maximum number of lectures (from the start of the
 *   list) to process.
 * @returns {Promise<void>} Resolves once every save request has settled.
 */
async function main(LIMIT = 5) {
  const { lectureIds, courseNumber } = await getJSON();
  const selectedIds = lectureIds.filter((_, i) => i < LIMIT);

  // One failing lecture must not abort the whole batch: treat it as "no captions".
  const urls = await Promise.all(
    selectedIds.map((id) =>
      getOneCaptionLink(courseNumber, id).catch((error) => {
        console.error(`Could not get the caption link for lecture ${id}`, error);
        return undefined;
      })
    )
  );

  // Lectures without a caption URL get an empty transcript.
  const tss = await Promise.all(
    urls.map((url) => (url ? getVTTAsText(url) : ""))
  );
  const sumLength = tss.reduce((accum, ts) => accum + (ts?.length ?? 0), 0);
  console.log(urls, sumLength, tss);

  // Await the uploads so callers know when everything has been sent.
  await Promise.all(
    selectedIds
      .map((id, idx) => ({ name: id, value: tss[idx] }))
      .filter(({ value }) => value.startsWith("WEB"))
      .map((entry) => saveLocally(entry, "http://localhost:3000/save-all"))
  );
}
// main(10);
}
// meant to run in the terminal
// Converts a given VTT file into plain text (with paragraphs).
// Cue timestamps are discarded; paragraphs are split on sentence-ending punctuation.
const fs = require("fs");
const WebVTT = require("node-webvtt");
const wrap = require("fast-word-wrap");
/**
 * Concatenates the `text` of every caption, each followed by a single space.
 *
 * @param {Iterable<{text: string}>} captions - Parsed caption cues.
 * @returns {string} All cue texts in order; "" for no cues.
 */
function extractTextWithoutTimestamps(captions) {
  return [...captions].map((cue) => cue.text + " ").join("");
}
/**
 * Splits `text` into paragraphs, one per sentence.
 *
 * The text is cut at runs of spaces that follow ".", "!" or "?". Each
 * resulting paragraph keeps a single trailing space. A final fragment that
 * does not end in sentence punctuation is kept as its own paragraph (the
 * original silently dropped it, losing the end of the transcript).
 *
 * @param {string} text - Plain transcript text.
 * @param {number} threshold - Currently unused; kept so existing callers that
 *   pass it keep working.
 * @returns {string[]} Paragraphs in order; empty array for blank input.
 */
function groupSentencesIntoParagraphs(text, threshold = 3) {
  const paragraphs = [];
  let currentParagraph = "";
  const sentences = text.split(/(?<=[.!?]) +/);
  for (const sentence of sentences) {
    currentParagraph += sentence + " ";
    if (/[.!?]$/.test(sentence)) {
      paragraphs.push(currentParagraph);
      currentParagraph = "";
    }
  }

  // Whatever is left never reached closing punctuation; keep it if it has content.
  const leftover = currentParagraph.trimEnd();
  if (leftover.trim() !== "") {
    paragraphs.push(`${leftover} `);
  }
  return paragraphs;
}
/**
 * Reads a VTT file synchronously, parses it with node-webvtt and returns the
 * cue text grouped into paragraphs.
 *
 * @param {string} filePath - Path of the .vtt file.
 * @returns {string[]} Paragraphs produced by groupSentencesIntoParagraphs.
 */
function processVTTFile(filePath) {
  const rawVtt = fs.readFileSync(filePath, "utf-8");
  const { cues } = WebVTT.parse(rawVtt);
  return groupSentencesIntoParagraphs(extractTextWithoutTimestamps(cues));
}
// trial -
// const vttFilePath = "your_subtitle_file.vtt";
// const paragraphs = processVTTFile(vttFilePath);
// paragraphs.forEach((paragraph, idx) => {
// console.log(`Paragraph ${idx + 1}:\n${paragraph}\n`);
// });
// place VTT files in current directory, and run this function
// .txt files will be generated
/**
 * Converts every .vtt file in `dir` into a word-wrapped .txt file.
 *
 * Each file is turned into paragraphs by processVTTFile, every non-empty line
 * is wrapped at 80 characters, and the result is written next to the source
 * (or into `dest`) with the extension changed to .txt.
 *
 * @param {string} dir - Folder that contains the .vtt files.
 * @param {string} dest - Output folder; empty/nullish means "same as dir".
 */
function createPlainTextFromVTT(dir = "./", dest = "") {
  // `dest ??= dir` never replaced the "" default, so output went to "/<file>".
  const outputDir = dest || dir;
  const cpl = 80;

  fs.readdir(dir, (err, files) => {
    if (err) throw err;

    files
      .filter((file) => file.toLowerCase().endsWith(".vtt"))
      .forEach((file) => {
        const paragraphs = processVTTFile(`${dir}/${file}`);
        const fileContent = paragraphs.join("\n" + "\n");

        // Wrap line by line; locals are declared here (the original leaked
        // `partials` and `p` as implicit globals).
        const formattedFileContent = fileContent
          .split("\n")
          .filter((line) => line)
          .map((line) => wrap(line, cpl))
          .join("\n");

        // Swap only the trailing extension, in any letter case, so "A.VTT"
        // cannot end up overwriting its own source file.
        const txtName = file.replace(/\.vtt$/i, ".txt");

        fs.writeFile(
          `${outputDir}/${txtName}`,
          formattedFileContent,
          (writeErr) => {
            if (writeErr) throw writeErr;
          }
        );
      });
  });
}
// createPlainTextFromVTT();
module.exports = { processVTTFile, createPlainTextFromVTT };
const fs = require("fs");
const myJSON = require("./nodejs-complete-guide-all.json");
// Builds one Markdown file per video lecture under ./output/<section>/,
// combining a heading, the lecture description and the plain-text transcript
// read from ./txt/<lecture id>.txt.
const { lectureIds, sectionTitles, pages } = myJSON;

// Flatten the results of all pages and keep only Video and Article items.
// NOTE(review): the original comment read "exclude 'Upcoming' is ignored" —
// presumably items of any other asset type are dropped here; confirm.
const lectureObjects = pages
  .map((item) => item.results)
  .flat(Infinity)
  .filter((item) => ["Video", "Article"].includes(item.asset.asset_type));

// Index into sectionTitles (1-based); advanced whenever a lecture title marks
// the start of a new section.
let sectionNumber = 0;

lectureObjects.forEach((item, index) => {
  const isVideo = item.asset.asset_type === "Video";
  if (!isVideo) return;

  const { id = "", title = "", title_cleaned = "", description = "" } = item;
  // Numbering counts articles too, because `index` runs over videos and articles.
  const lectureNumber = index + 1;
  const lectureHead = [
    `## ${lectureNumber}. ${title}`,
    ``,
    `<strong><em>${description || "no description"}</em></strong>`,
    ``,
  ].join("\n");
  const fileName = `${lectureNumber}_${title_cleaned}.md`;
  const lectureFooter = [`---`].join("\n");

  if (
    ["Module Introduction", "Introduction", "Course Roundup"].includes(title)
  ) {
    sectionNumber++;
  }

  const sectionTitle = sectionTitles[sectionNumber - 1];
  // Lectures that appear before the first section marker (or beyond the known
  // titles) used to land in a folder literally named "undefined".
  const sectionFolderName =
    sectionTitle
      ?.replaceAll(".", "")
      .replaceAll(":", "")
      .replaceAll("Section ", "")
      .replaceAll(",", "_")
      .replaceAll(" ", "_")
      .replaceAll("__", "_")
      .replaceAll("(", "")
      .replaceAll(")", "") ?? "unsectioned";

  fs.readFile(`./txt/${id}.txt`, "utf8", (readErr, text) => {
    if (readErr) {
      // Best effort: still write the heading, but say that the transcript is missing.
      console.warn(`No transcript for lecture ${id}: ${readErr.message}`);
    }
    const fileContent = [lectureHead, text ?? "", lectureFooter].join("\n");

    fs.mkdir(`./output/${sectionFolderName}`, { recursive: true }, (mkdirErr) => {
      if (mkdirErr) throw mkdirErr;
      fs.writeFile(
        `./output/${sectionFolderName}/${fileName}`,
        fileContent,
        (writeErr) => {
          if (writeErr) throw writeErr;
        }
      );
    });
  });
});
@sanjarcode
Copy link
Author

sanjarcode commented Aug 7, 2023

Next step is to process the .vtt files and create plain text out of them. By plain text I mean "creating paragraphs" based on timestamps. See https://pypi.org/project/vttformatter/

Didn't work. Wrote a naive vtt to plain text paragraphs converter with the help of ChatGPT

done

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment