Last active
July 12, 2024 14:52
-
-
Save signalwerk/8262a30a3ac36efabd5d6f4d075790ab to your computer and use it in GitHub Desktop.
A tool to extract content from HTML, convert it to YAML, and update HTML content from YAML files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// HTML to YAML Extractor and Updater
/**
 * A tool to extract content from HTML, convert it to YAML, and update HTML content from YAML files.
 */
import cheerio from "cheerio"; | |
import { promises as fs } from "fs"; | |
import path from "path"; | |
import yaml from "js-yaml"; | |
import TurndownService from "turndown"; | |
import { remark } from "remark"; | |
import remarkHtml from "remark-html"; | |
import prettier from "prettier"; | |
// Shared HTML-to-Markdown converter, constructed with Turndown's default options.
const turndownService = new TurndownService();
// Directory the per-block YAML files are written to and read from.
const yamlDir = "./yamls";
// CSS selector for the candidate content blocks inside each HTML page.
let textDivsSelect = ".colMain > .wrap > .contentItem";
/**
 * Strips legacy Internet Explorer conditional comments from an HTML string.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup with conditional-comment markers removed.
 */
function cleanHTML(html) {
  // From an opening "<!--[if" up to the first "-->" that follows it.
  const conditionalOpen = /<!--\[if[\s\S]*?-->/g;
  // Leftover closing markers such as "<!--<![endif]-->".
  const conditionalClose = /<!--([^>]+?)endif\]-->/g;
  return html.replace(conditionalOpen, "").replace(conditionalClose, "");
}
/**
 * Recursively walks a directory and calls `callback` for every file whose
 * extension is exactly ".html". Entries are processed one at a time, in the
 * order `readdir` returns them; each callback is awaited before moving on.
 *
 * @param {string} dir - Directory to walk.
 * @param {Function} callback - Called as `callback(filePath, dir, root)`.
 * @param {string} [root=dir] - Directory the walk started from; passed through unchanged.
 * @returns {Promise<void>}
 */
async function traverseDir(dir, callback, root = dir) {
  const entries = await fs.readdir(dir);
  for (const entry of entries) {
    const entryPath = path.join(dir, entry);
    // lstat does not follow symlinks, so a symlinked directory is not descended into.
    const info = await fs.lstat(entryPath);
    if (info.isDirectory()) {
      await traverseDir(entryPath, callback, root);
      continue;
    }
    if (path.extname(entry) !== ".html") {
      continue;
    }
    await callback(entryPath, dir, root);
  }
}
/**
 * Extracts the content blocks of one HTML file into YAML files and tags each
 * block in the HTML with a `data-yaml` attribute naming its YAML file.
 *
 * For every element matching `textDivsSelect` (except those carrying a class
 * that starts with "wrapsfcustom-") the inner HTML is converted to Markdown,
 * formatted, and written to `<yamlDir>/<identifier>.yaml` under a `content`
 * key. The HTML file itself is then re-formatted and overwritten.
 *
 * @param {string} filePath - Path of the HTML file to process.
 * @param {string} doc - Directory containing the file (passed by traverseDir; unused here).
 * @param {string} root - Root of the traversal; identifiers are derived from the path relative to it.
 * @returns {Promise<void>} Resolves once every YAML file and the HTML file have been written.
 */
async function extractYamlFromHtml(filePath, doc, root) {
  // NOTE(review): the original literal here is an invisible whitespace
  // character. Replacing ordinary spaces would corrupt tag attributes, so it
  // is presumably a non-breaking space (U+00A0) — confirm against the gist.
  // Written as an escape so the character is visible in editors and diffs.
  const nbsp = "\u00a0";
  const nbspPlaceholder = "zzzzz--nbsp--zzzzz";

  const rawHtml = await fs.readFile(filePath, "utf-8");
  const $ = cheerio.load(cleanHTML(rawHtml));

  const textDivs = $(textDivsSelect)
    .filter((index, element) => {
      const classes = $(element).attr("class") ?? "";
      // Exclude elements that have any class starting with "wrapsfcustom-".
      return !classes
        .split(/\s+/)
        .some((className) => className.startsWith("wrapsfcustom-"));
    })
    .toArray();

  // Identifier prefix: path relative to root, without extension, separators
  // flattened to "-" so it is usable as a single file name.
  const { dir: relativeDir, name: baseName } = path.parse(
    path.relative(root, filePath),
  );
  const identifierPrefix = path
    .join(relativeDir, baseName)
    .replaceAll(path.sep, "-");

  // Make sure the output directory exists before the first write.
  await fs.mkdir(yamlDir, { recursive: true });

  // A plain loop (rather than cheerio's .each with an async callback) so that
  // every write is awaited and any failure rejects this function's promise.
  for (const [i, el] of textDivs.entries()) {
    // Shield non-breaking spaces from the Markdown conversion, then restore them.
    const shieldedHtml = $(el).html().replaceAll(nbsp, nbspPlaceholder);
    const markdown = turndownService
      .turndown(shieldedHtml)
      .replaceAll(nbspPlaceholder, nbsp);

    // prettier.format is synchronous in Prettier 2 and returns a Promise in
    // Prettier 3; awaiting works with both.
    const prettyMarkdown = await prettier.format(markdown, {
      parser: "markdown",
    });
    const yamlContent = await prettier.format(
      yaml.dump({ content: prettyMarkdown }),
      { parser: "yaml" },
    );

    const fullIdentifier = `${identifierPrefix}-${i}`;
    const yamlPath = path.join(yamlDir, `${fullIdentifier}.yaml`);
    $(el).attr("data-yaml", fullIdentifier);

    await fs.writeFile(yamlPath, yamlContent, "utf-8");
    console.log(`Wrote "${yamlPath}"`);
  }

  const formattedHtml = await prettier.format($.html(), { parser: "html" });
  await fs.writeFile(filePath, formattedHtml, "utf-8");
}
/**
 * Rebuilds the tagged blocks of one HTML file from their YAML files.
 *
 * For every element with a `data-yaml` attribute, reads
 * `<yamlDir>/<attribute value>.yaml`, renders its `content` field from
 * Markdown to HTML, and replaces the element's inner HTML with the result.
 * The HTML file is then re-formatted and overwritten.
 *
 * @param {string} filePath - Path of the HTML file to update.
 * @param {string} doc - Directory containing the file (passed by traverseDir; unused here).
 * @param {string} root - Root of the traversal (unused here).
 * @returns {Promise<void>} Resolves once the HTML file has been written.
 * @throws {Error} If a referenced YAML file does not contain a mapping.
 */
async function updateHtmlFromYaml(filePath, doc, root) {
  const html = await fs.readFile(filePath, "utf-8");
  const $ = cheerio.load(html);
  const taggedElements = $("[data-yaml]").toArray();

  // The blocks are independent of each other, so load and render them concurrently.
  await Promise.all(
    taggedElements.map(async (el) => {
      const identifier = $(el).attr("data-yaml").trim();
      const yamlPath = path.join(yamlDir, `${identifier}.yaml`);
      const data = yaml.load(await fs.readFile(yamlPath, "utf-8"));

      // An empty or scalar YAML document would otherwise fail with an opaque
      // TypeError on `data.content`; name the offending file instead.
      if (data === null || typeof data !== "object") {
        throw new Error(
          `Expected a YAML mapping with a "content" key in "${yamlPath}"`,
        );
      }

      const rendered = await remark().use(remarkHtml).process(data.content);
      console.log(`set from "${yamlPath}"`);
      $(el).html(rendered.toString());
    }),
  );

  // prettier.format is synchronous in Prettier 2 and returns a Promise in
  // Prettier 3; awaiting works with both.
  const formattedHtml = await prettier.format($.html(), { parser: "html" });
  await fs.writeFile(filePath, formattedHtml, "utf-8");
}
// Command-line entry point: `--extract <directory>` or `--update <directory>`.
// `--extract` takes precedence when both flags are present.
{
  const commands = [
    ["--extract", extractYamlFromHtml],
    ["--update", updateHtmlFromYaml],
  ];
  const selected = commands.find(([flag]) => process.argv.includes(flag));
  // The directory is the argument that immediately follows the flag.
  const dir = selected
    ? process.argv[process.argv.indexOf(selected[0]) + 1]
    : undefined;

  if (!selected || !dir) {
    // Also covers a flag given without a directory, which would otherwise
    // pass `undefined` into the traversal.
    console.error(
      "Please specify a command: --extract <directory> or --update <directory>",
    );
    process.exit(1);
  } else {
    // Report failures and signal them via the exit code instead of leaving
    // the promise floating as an unhandled rejection.
    traverseDir(dir, selected[1]).catch((error) => {
      console.error(error);
      process.exitCode = 1;
    });
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment