Created
December 17, 2024 12:07
-
-
Save mindplay-dk/110df1573ae1e32f5c219575355a2998 to your computer and use it in GitHub Desktop.
A horrible script to convert the entire PHP manual from HTML files to Markdown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs');
const path = require('path');
const TurndownService = require('turndown');

// HTML is read from ./html and Markdown is written to ./markdown, both
// located next to this script.
const inputDir = path.join(__dirname, 'html');
const outputDir = path.join(__dirname, 'markdown');

// Single shared converter instance, configured to emit fenced code blocks.
const turndownOptions = { codeBlockStyle: 'fenced' };
const turndownService = new TurndownService(turndownOptions);
/**
 * Builds a Turndown `replacement` callback that renders the matched node as a
 * fenced Markdown code block.
 *
 * @param {string} language - Info string written right after the opening
 *   fence (pass '' for an untagged block).
 * @returns {function(string, Object, Object): string} Callback taking
 *   `(content, node, options)`; only `node` is actually read.
 */
const replaceWithFencedCodeBlock = (language) => (content, node, options) => {
  // Give every <br> a newline as its text so that the textContent read below
  // contains real line breaks. NOTE: this mutates the node being converted.
  const brs = node.getElementsByTagName('br');
  for (let i = 0; i < brs.length; i++) {
    brs[i].textContent = '\n';
  }

  // Only the first child's text is used. A node without any children yields
  // an empty code block instead of throwing a TypeError.
  const code = node.firstChild ? node.firstChild.textContent : '';

  // Make the fence longer than any backtick run that starts a line inside the
  // code, so the sample cannot terminate its own block early.
  let fenceSize = 3;
  const fenceInCode = /^`{3,}/gm;
  let match;
  while ((match = fenceInCode.exec(code)) !== null) {
    if (match[0].length >= fenceSize) {
      fenceSize = match[0].length + 1;
    }
  }
  const fence = '`'.repeat(fenceSize);

  return (
    '\n\n' + fence + language + '\n' +
    code.replace(/\n$/, '') + // drop a single trailing newline only
    '\n' + fence + '\n\n'
  );
};
// Rule for highlighted PHP samples: a node whose class is exactly `phpcode`
// and whose first child is a <pre> that itself starts with a <code>.
turndownService.addRule('phpCodeBlock', {
  filter(node, options) {
    if (options.codeBlockStyle !== 'fenced' || node.className !== 'phpcode') {
      return false;
    }

    const pre = node.firstChild;
    if (!pre || pre.nodeName !== 'PRE') {
      return false;
    }

    const code = pre.firstChild;
    return Boolean(code) && code.nodeName === 'CODE';
  },
  // Rendered as a fence tagged `php`.
  replacement: replaceWithFencedCodeBlock('php'),
});
// Rule for <pre> elements whose class is exactly `examplescode`; they are
// rendered as a fence with no language tag.
turndownService.addRule('phpCodeExample', {
  filter: (node, options) => {
    const wantsFenced = options.codeBlockStyle === 'fenced';
    const isExamplePre =
      node.nodeName === 'PRE' && node.className === 'examplescode';
    return wantsFenced && isExamplePre;
  },
  replacement: replaceWithFencedCodeBlock(''),
});
// Ensure the output directory exists. With `recursive: true` this is a no-op
// when the directory is already present, so no separate existence check (and
// no check-then-create race) is needed.
fs.mkdirSync(outputDir, { recursive: true });

// Matches highlighted PHP samples of the form
// <div class="phpcode"><code>…</code></div>. `[\s\S]` is used instead of `.`
// so that a sample spanning several source lines is matched as well.
const phpCodePattern = /<div class="phpcode"><code>([\s\S]*?)<\/code><\/div>/g;

// Collect every HTML file directly inside the input directory.
const files = fs.readdirSync(inputDir).filter((file) => file.endsWith('.html'));
console.log(`Total files to process: ${files.length}`);

// Convert each HTML file to a Markdown file with the same base name.
files.forEach((file, index) => {
  const inputPath = path.join(inputDir, file);
  const outputPath = path.join(outputDir, file.replace(/\.html$/, '.md'));

  // Wrap the <code> of PHP samples in <pre> so the converted DOM has the
  // PRE > CODE shape that the `phpCodeBlock` rule's filter checks for.
  const htmlContent = fs
    .readFileSync(inputPath, 'utf8')
    .replace(phpCodePattern, '<div class="phpcode"><pre><code>$1</code></pre></div>');

  const markdownContent = turndownService.turndown(htmlContent);
  fs.writeFileSync(outputPath, markdownContent);

  console.log(`[${index + 1} of ${files.length}] ${inputPath} -> ${outputPath}`);
});
console.log('Conversion complete.');
// Concatenate every Markdown file in the output directory into a single
// `manual.md` next to this script.
const markdownFiles = fs.readdirSync(outputDir).filter((file) => file.endsWith('.md'));
const manualPath = path.join(__dirname, 'manual.md');

// Despite its name, `manualStream` is a numeric file descriptor. The 'w' flag
// creates the file or truncates an existing one.
const manualStream = fs.openSync(manualPath, 'w');
try {
  // Append each file's content followed by a blank line as separator.
  markdownFiles.forEach((file, index) => {
    const filePath = path.join(outputDir, file);
    const markdownContent = fs.readFileSync(filePath, 'utf8');
    fs.appendFileSync(manualStream, markdownContent + "\n\n");
    console.log(`[${index + 1} of ${markdownFiles.length}] ${filePath} -> ${manualPath}`);
  });
} finally {
  // Always release the descriptor, even when a read or append throws.
  fs.closeSync(manualStream);
}
console.log('Manual file created.');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment