Last active
January 11, 2022 02:59
-
-
Save jasenmichael/ebfa5f1c411a9f3fd735d2bb342fb056 to your computer and use it in GitHub Desktop.
Recursively convert HTML to Markdown using Node.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs"); | |
const path = require("path"); | |
const glob = require("glob"); | |
const TurndownService = require("turndown"); | |
// Conversion settings passed to Turndown: ATX-style headings, "-" as the bullet
// marker, and reference-style links written with full references.
const turndownOptions = {
  headingStyle: "atx",
  bulletListMarker: "-",
  linkStyle: "referenced",
  linkReferenceStyle: "full",
};

// Shared HTML-to-Markdown converter used for every processed file.
const turndownService = new TurndownService(turndownOptions);
// File extension of the inputs; stripped from line endings and swapped for ".md" on output.
const HTML_EXTENSION = ".html";

// Escaped script fragment as it appears in the Markdown output; only the text after it is kept.
const TRACKING_MARKER = 'turn\\_client\\_track\\_id = "";';

// Lines dropped when, after trimming, they equal one of these entries
// (presumably navigation/search widgets of the mirrored site — verify against real output).
const NAVIGATION_LINES = new Set(["- Search", "Search", "- Previous", "- |", "- Next"]);

// Lines dropped when they start (untrimmed) with one of these prefixes.
const SKIPPED_LINE_PREFIXES = [
  "window.dataLayer",
  "!function",
  "![](",
  "function ",
  "<img ",
  "### _!",
  "// ",
];

// Matches a reference-style link with empty link text, e.g. "[][12]".
const EMPTY_REFERENCE_LINK = /\[\]\[(.*?)\]/;

/**
 * Normalizes a single Markdown line.
 *
 * @param {string} line - One line of Turndown output.
 * @returns {string} The line without a trailing ".html" and, when the tracking
 *   marker is present, reduced to the text that follows the first marker.
 */
function cleanLine(line) {
  // Slice off the suffix itself; String#replace would remove the FIRST ".html" in the line.
  const withoutExtension = line.endsWith(HTML_EXTENSION)
    ? line.slice(0, -HTML_EXTENSION.length)
    : line;
  return withoutExtension.includes(TRACKING_MARKER)
    ? withoutExtension.split(TRACKING_MARKER)[1]
    : withoutExtension;
}

/**
 * Decides whether a Markdown line is kept in the output.
 *
 * @param {string} line - A line already processed by cleanLine.
 * @returns {boolean} false for single-space lines, navigation entries, empty
 *   reference links and lines starting with one of the skipped prefixes.
 */
function isContentLine(line) {
  const trimmed = line.trim();
  return (
    line !== " " &&
    !NAVIGATION_LINES.has(trimmed) &&
    !EMPTY_REFERENCE_LINK.test(line) &&
    !SKIPPED_LINE_PREFIXES.some((prefix) => line.startsWith(prefix)) &&
    !trimmed.startsWith("jQuery(function()")
  );
}

/**
 * Converts one HTML file to Markdown and writes it under "out/", mirroring the
 * input's directory layout. The first output line is the file path with
 * "site/" replaced by "http://".
 *
 * @param {string} file - Path of the HTML file as returned by glob.
 * @returns {void}
 */
function convertFile(file) {
  const outDir = path.join("out", path.dirname(file));
  // With `recursive: true`, mkdirSync does not fail when the directory already exists.
  fs.mkdirSync(outDir, { recursive: true });
  console.log(file);

  const html = fs.readFileSync(file, "utf8");
  // NOTE(review): as written, the / - /, /- / and final /\r\n\r\n/ replacements
  // substitute a string with itself; they may have lost repeated whitespace — confirm.
  const body = turndownService
    .turndown(html)
    .split("\n")
    .map((line) => cleanLine(line))
    .filter((line) => isContentLine(line))
    .join("\r\n")
    .replace(/ \r\n /g, " ")
    // Prefix every "]: " (reference link definitions) with the site origin...
    .replace(/]: /g, "]: https://www.goodwillcentraltexas.org/")
    .replace(/- - /g, "- ")
    .replace(/ - /g, " - ")
    .replace(/- /g, "- ")
    // ...then undo the prefix where the target was already an absolute http(s) URL.
    .replace(/https:\/\/www\.goodwillcentraltexas\.org\/http/g, "http")
    // String pattern: only the first run of 14 consecutive CRLFs is replaced.
    .replace(
      "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n",
      "\r\n"
    )
    .replace(/\r\n\r\n\r\n/g, "\r\n\r\n")
    .replace(/\r\n\r\n/g, "\r\n\r\n");

  const text = `${file.replace("site/", "http://")}\r\n${body}`;
  // Swap only the file's own extension, so a ".html" earlier in the path is left alone.
  const outFile = path.join(outDir, `${path.basename(file, HTML_EXTENSION)}.md`);
  fs.writeFileSync(outFile, text);
}

// Convert every HTML file below "site/" and report how many were processed.
glob("site/**/*.html", (err, files) => {
  if (err) {
    // Stop here: `files` is not usable after a failed glob.
    console.error(err);
    return;
  }
  const htmlFiles = files || [];
  htmlFiles.forEach((file) => convertFile(file));
  console.log(`complete! processed: ${htmlFiles.length} files`);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment