Last active
August 30, 2023 05:58
-
-
Save sanjarcode/0ecc4fc4cb281e1b061d0edf27c89b4c to your computer and use it in GitHub Desktop.
Download Udemy caption links, other Udemy setup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Local server (assumes Node.js is installed) | |
Purpose - the console code sends data to this (local) server, so | |
the transcript files can be saved locally. | |
Add this file to a folder, then run the following command in the folder | |
`npm init -y && npm install express body-parser cors && node app.js` | |
*/ | |
const express = require("express"); | |
const bodyParser = require("body-parser"); | |
const cors = require("cors"); | |
// Create the Express application and register the request middleware in
// order: URL-encoded form bodies, JSON bodies, then CORS with its default
// options so that pages on other origins can call this local server.
const app = express();
app.use(
  bodyParser.urlencoded({ extended: false }),
  express.json(),
  cors()
);
// GET / — readiness probe: always answers with a fixed JSON string.
app.get("/", function handleRoot(_req, res) {
  res.json("Save all server ready");
});
/**
 * POST /save-all — stores one caption file as ./vtt/<name>.vtt.
 *
 * Reads `name` and `value` from `req.body`.
 *   - name:  file name without extension (string or number); any directory
 *            components are stripped so the write cannot leave ./vtt.
 *   - value: the file contents; must be a string.
 *
 * Responds "Ok" on success, "Not Ok" with status 400 for an invalid body, and
 * "Not Ok" with status 500 when the directory or file cannot be written.
 */
app.post("/save-all", (req, res) => {
  const fs = require("fs");
  const path = require("path");
  const dir = "./vtt";

  const { name, value } = req.body ?? {};
  // basename() drops "../" and absolute prefixes coming from the client.
  const fileStem = name == null ? "" : path.basename(String(name));
  if (fileStem === "" || typeof value !== "string") {
    res.status(400).send("Not Ok");
    return;
  }

  // Create the target folder on demand instead of assuming it exists.
  fs.mkdir(dir, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      // Report the failure to the client; throwing here would kill the server.
      console.error(`Could not create ${dir}:`, mkdirErr);
      res.status(500).send("Not Ok");
      return;
    }
    const target = path.join(dir, `${fileStem}.vtt`);
    fs.writeFile(target, value, (writeErr) => {
      if (writeErr) {
        console.error(`Could not write ${target}:`, writeErr);
        res.status(500).send("Not Ok");
        return;
      }
      console.log("Saved!", target);
      res.send("Ok");
    });
  });
});
// Fallback for requests that matched no route above: answer 404 with a JSON
// string, like the other routes in this file. The previous res.render("404")
// needs a view engine, and none is configured in this file; it also called
// next() after the response had already been started.
app.use((req, res) => {
  res.status(404).json("Page Not Found");
});
// Start accepting connections on port 3000 and log once the server is up.
app.listen(3000, function onListening() {
  console.log("Save-all server running on port 3000");
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// meant to run in the terminal | |
// Converts a given VTT file into plain text (with paragraphs) | |
// (intended to respect the time gaps between cues; the current code ignores
// timestamps and splits the text on sentence-ending punctuation only)
const fs = require("fs"); | |
const WebVTT = require("node-webvtt"); | |
const wrap = require("fast-word-wrap"); | |
/**
 * Concatenates the `text` of every cue into one string, discarding timing.
 *
 * @param {Iterable<{text: string}>} captions - cues to flatten.
 * @returns {string} each cue's text followed by a single space ("" if none).
 */
function extractTextWithoutTimestamps(captions) {
  return [...captions].map((cue) => cue.text + " ").join("");
}
/**
 * Splits text into chunks that each end at sentence-ending punctuation.
 *
 * The text is cut after every ".", "!" or "?" that is followed by spaces.
 * Each chunk is returned with one trailing space. Text left over at the end
 * that has no closing punctuation is kept as a final chunk rather than being
 * dropped.
 *
 * @param {string} text - the text to split.
 * @param {number} [threshold=3] - currently unused; kept so existing callers
 *   that pass it keep working.
 * @returns {string[]} the chunks, in order; empty for blank input.
 */
function groupSentencesIntoParagraphs(text, threshold = 3) {
  const paragraphs = [];
  let currentParagraph = "";
  for (const sentence of text.split(/(?<=[.!?]) +/)) {
    currentParagraph += `${sentence} `;
    if (/[.!?]$/.test(sentence)) {
      paragraphs.push(currentParagraph);
      currentParagraph = "";
    }
  }
  // Flush an unterminated tail (e.g. a transcript cut off mid-sentence).
  if (currentParagraph.trim() !== "") {
    paragraphs.push(`${currentParagraph.trimEnd()} `);
  }
  return paragraphs;
}
/**
 * Reads a VTT file synchronously and converts its cues into paragraphs.
 *
 * @param {string} filePath - path of the .vtt file to read (as UTF-8).
 * @returns {string[]} the result of groupSentencesIntoParagraphs applied to
 *   the concatenated cue text.
 */
function processVTTFile(filePath) {
  const rawVtt = fs.readFileSync(filePath, "utf-8");
  const { cues } = WebVTT.parse(rawVtt);
  const flatText = extractTextWithoutTimestamps(cues);
  return groupSentencesIntoParagraphs(flatText);
}
// trial - | |
// const vttFilePath = "your_subtitle_file.vtt"; | |
// const paragraphs = processVTTFile(vttFilePath); | |
// paragraphs.forEach((paragraph, idx) => { | |
// console.log(`Paragraph ${idx + 1}:\n${paragraph}\n`); | |
// }); | |
// place VTT files in current directory, and run this function | |
// .txt files will be generated | |
/**
 * Converts every .vtt file in `dir` into a word-wrapped .txt file.
 *
 * For each file whose name ends in ".vtt" (case-insensitive) the paragraphs
 * from processVTTFile are wrapped at 80 characters per line and written to
 * `<dest>/<same name>.txt`.
 *
 * @param {string} [dir="./"] - folder that holds the .vtt files.
 * @param {string} [dest=""] - output folder; falls back to `dir` when empty.
 * @throws the readdir / writeFile error, from the respective callback.
 */
function createPlainTextFromVTT(dir = "./", dest = "") {
  // `||` rather than `??`: the default is "", which must also fall back to
  // `dir` — otherwise the output path becomes "/<file>.txt".
  const outDir = dest || dir;
  const cpl = 80; // characters per line

  fs.readdir(dir, (err, files) => {
    if (err) throw err;
    files
      .filter((file) => file.toLowerCase().endsWith(".vtt"))
      .forEach((file) => {
        const paragraphs = processVTTFile(`${dir}/${file}`);
        // Wrap each non-empty line on its own; blank lines are dropped.
        const formattedFileContent = paragraphs
          .flatMap((paragraph) => paragraph.split("\n"))
          .filter((line) => line)
          .map((line) => wrap(line, cpl))
          .join("\n");
        // Swap only the trailing extension, whatever its letter case, so the
        // source file is never overwritten and folder names are left alone.
        const txtName = `${file.slice(0, -".vtt".length)}.txt`;
        fs.writeFile(`${outDir}/${txtName}`, formattedFileContent, (writeErr) => {
          if (writeErr) throw writeErr;
        });
      });
  });
}
// createPlainTextFromVTT(); | |
module.exports = { processVTTFile, createPlainTextFromVTT }; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs"); | |
const myJSON = require("./nodejs-complete-guide-all.json"); | |
const { lectureIds, sectionTitles, pages } = myJSON; | |
// Flatten the `results` of every page into one list and keep only the items
// whose asset type is "Video" or "Article".
const KEPT_ASSET_TYPES = ["Video", "Article"];
const allPageResults = pages.map((page) => page.results).flat(Infinity);
const lectureObjects = allPageResults.filter((lecture) =>
  KEPT_ASSET_TYPES.includes(lecture.asset.asset_type)
);
// anything that is not a "Video" or "Article" (e.g. 'Upcoming') is ignored
// console.log(lectureObjects[0], lectureIds.length); | |
// fs.readdir("./txt/", (err, files) => console.log(files)); | |
// Running section counter; bumped whenever a video lecture has one of the
// section-opening titles listed below.
let sectionNumber = 0;

// Folder used when no section title is available for a lecture, instead of
// writing into a folder literally named "undefined".
const FALLBACK_SECTION_FOLDER = "_unsorted";

// Titles that mark the first lecture of a new section.
const SECTION_OPENING_TITLES = [
  "Module Introduction",
  "Introduction",
  "Course Roundup",
];

/**
 * Turns a section title into a folder name: removes ".", ":", "(", ")" and
 * the text "Section ", and turns "," and " " into "_".
 *
 * @param {string|undefined} sectionTitle - title taken from `sectionTitles`.
 * @returns {string} the folder name, or the fallback when the title is missing
 *   or reduces to an empty string.
 */
function toSectionFolderName(sectionTitle) {
  const folderName = sectionTitle
    ?.replaceAll(".", "")
    .replaceAll(":", "")
    .replaceAll("Section ", "")
    .replaceAll(",", "_")
    .replaceAll(" ", "_")
    .replaceAll("__", "_")
    .replaceAll("(", "")
    .replaceAll(")", "");
  return folderName || FALLBACK_SECTION_FOLDER;
}

// For every video lecture, write ./output/<section>/<n>_<title>.md made of a
// heading, the transcript from ./txt/<id>.txt, and a "---" footer.
// Articles are skipped, but still count toward the lecture numbering because
// the number comes from the position in `lectureObjects`.
lectureObjects.forEach((item, index) => {
  if (item.asset.asset_type !== "Video") return;

  const { id = "", title = "", title_cleaned = "", description = "" } = item;
  const lectureNumber = index + 1;
  const lectureHead = [
    `## ${lectureNumber}. ${title}`,
    ``,
    `<strong><em>${description || "no description"}</em></strong>`,
    ``,
  ].join("\n");
  const lectureFooter = `---`;
  const fileName = `${lectureNumber}_${title_cleaned}.md`;

  if (SECTION_OPENING_TITLES.includes(title)) {
    sectionNumber++;
  }
  const sectionFolderName = toSectionFolderName(
    sectionTitles[sectionNumber - 1]
  );
  const sectionDir = `./output/${sectionFolderName}`;

  fs.readFile(`./txt/${id}.txt`, "utf8", (readErr, text) => {
    if (readErr) {
      // Best effort: a lecture without a transcript still gets its heading
      // and footer, but the gap is reported instead of passing silently.
      console.warn(`No transcript for lecture ${lectureNumber} (${id}): ${readErr.message}`);
    }
    const fileContent = [lectureHead, text ?? "", lectureFooter].join("\n");

    fs.mkdir(sectionDir, { recursive: true }, (mkdirErr) => {
      if (mkdirErr) throw mkdirErr;
      fs.writeFile(`${sectionDir}/${fileName}`, fileContent, (writeErr) => {
        if (writeErr) throw writeErr;
      });
    });
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Next step is to process the .vtt files and create plain text out of them. By plain text I mean "creating paragraphs" based on timestamps. See https://pypi.org/project/vttformatter/

Didn't work.

Wrote a naive VTT-to-plain-text paragraph converter with the help of ChatGPT.

Done.