Created
July 16, 2023 09:44
-
-
Save yurenju/5c7ff1d9bd090ec6fecf9575d9d05181 to your computer and use it in GitHub Desktop.
blogger-to-hugo.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { XMLParser } = require("fast-xml-parser"); | |
const TurndownService = require("turndown"); | |
const moment = require("moment"); | |
const shell = require("shelljs"); | |
const fetch = require("node-fetch"); | |
const yaml = require("js-yaml"); | |
const fs = require("fs"); | |
const path = require("path"); | |
// Directory that receives the generated posts.
const OUT_DIR = "out";

// HTML -> Markdown converter, configured to emit fenced code blocks.
const tds = new TurndownService({ codeBlockStyle: "fenced", fence: "```" });

// Render every <pre> element as a fenced code block.
// The fence is wrapped in blank lines so that Turndown separates it from the
// neighbouring output; without them the opening fence can be glued onto the
// end of preceding inline text and stop being recognised as a fence.
tds.addRule("wppreblock", {
  filter: ["pre"],
  replacement: function (content) {
    return "\n\n```\n" + content + "\n```\n\n";
  },
});

// Attributes must be kept: the category scheme/term values used for tags are
// stored as XML attributes (read later as "@_scheme" / "@_term").
const parser = new XMLParser({
  ignoreAttributes: false,
});
/**
 * Make a post title safe to use as a file or folder name.
 *
 * Characters that are reserved in file names are swapped for visually similar
 * Unicode characters; ASCII control characters (0x00-0x1F) are removed.
 *
 * @param {string|number} title - Post title; non-string values are stringified.
 * @returns {string} The sanitized title.
 */
function normalizeFilename(title) {
  // Replacements are written as escapes so they cannot be silently flattened
  // back to their ASCII counterparts by Unicode normalization of this file.
  const specialCharsMap = {
    "<": "\u3008", // LEFT ANGLE BRACKET
    ">": "\u3009", // RIGHT ANGLE BRACKET
    ":": "\uFF1A", // FULLWIDTH COLON
    '"': "\u3003", // DITTO MARK
    "/": "\uFF0F", // FULLWIDTH SOLIDUS
    "\\": "\uFF3C", // FULLWIDTH REVERSE SOLIDUS
    "|": "\uFF5C", // FULLWIDTH VERTICAL LINE
    "?": "\uFF1F", // FULLWIDTH QUESTION MARK
    "*": "\uFF0A", // FULLWIDTH ASTERISK
    ".": "\uFF0E", // FULLWIDTH FULL STOP
  };
  return String(title).replace(/[<>:"\/\\|?.\*\x00-\x1F]/g, (match) => {
    const replacement = specialCharsMap[match];
    // Control characters have no map entry: drop them instead of inserting
    // the text "undefined".
    return replacement === undefined ? "" : replacement;
  });
}
/**
 * Download the images referenced by <img ... src="..."> tags into
 * `<articleFolder>/images` and point the HTML at the local copies.
 *
 * Images are saved as `<index><ext>`, where index is the position of the tag
 * in the document. A failed download is logged and skipped, leaving the
 * original URL in the HTML. The images folder is removed again when nothing
 * was saved.
 *
 * @param {string} articleFolder - Folder of the article being generated.
 * @param {string} htmlContent - HTML of the post.
 * @returns {Promise<string>} The HTML with downloaded image URLs rewritten
 *   to `images/<file>`.
 */
async function transformImages(articleFolder, htmlContent) {
  let modifiedHtml = htmlContent;
  const imagesFolder = `${articleFolder}/images`;
  shell.mkdir("-p", imagesFolder);

  // [^>] keeps the match inside a single tag, so an <img> without a src can
  // never pick up the src of a later element.
  const pattern = /<img\b[^>]*?\ssrc="([^"]*)"/g;
  let matched;
  let savedCount = 0;
  for (let i = 0; (matched = pattern.exec(htmlContent)) !== null; i++) {
    const imageUrl = matched[1];
    try {
      const response = await fetch(imageUrl);
      if (!response.ok) {
        // Do not store an HTTP error page as if it were the image.
        throw new Error(
          `failed to download ${imageUrl}: HTTP ${response.status}`
        );
      }
      const buffer = await response.buffer();
      // Take the extension from the URL path only, so a query string or
      // fragment does not end up in the file name.
      const ext = path.extname(new URL(imageUrl).pathname);
      const filename = `${i}${ext}`;
      fs.writeFileSync(`${imagesFolder}/${filename}`, buffer);
      // A replacer function keeps "$" in the file name from being treated as
      // a replacement pattern.
      modifiedHtml = modifiedHtml.replace(imageUrl, () => `images/${filename}`);
      savedCount++;
    } catch (e) {
      console.error(e.message);
    }
  }

  if (savedCount === 0) {
    shell.rm("-rf", imagesFolder);
  }
  return modifiedHtml;
}
/**
 * Convert the Blogger export in `blog-backup.xml` into one folder per post
 * under OUT_DIR, each holding an `index.md` (YAML front matter + Markdown)
 * and, when the post has images, an `images` folder.
 *
 * OUT_DIR is deleted and recreated on every run. Comments (entries with
 * "thr:in-reply-to") and drafts are skipped.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the export has no <feed> root element.
 */
async function main() {
  // The parser yields a bare object for a single child element and an array
  // for repeated ones; normalize both (and a missing element) to an array.
  const toArray = (value) => (value == null ? [] : [].concat(value));
  // An element with attributes is parsed into an object whose text sits under
  // "#text"; an element without attributes is presumably the bare value.
  const textOf = (node) =>
    node !== null && typeof node === "object" ? node["#text"] : node;

  shell.rm("-rf", OUT_DIR);
  shell.mkdir("-p", OUT_DIR);

  const result = parser.parse(shell.cat("blog-backup.xml").toString());
  if (!result || !result.feed) {
    throw new Error("blog-backup.xml is missing or has no <feed> element");
  }

  const posts = toArray(result.feed.entry).filter((entry) => {
    const isPost = entry.id.indexOf(".post-") > 0;
    const isComment = Object.prototype.hasOwnProperty.call(
      entry,
      "thr:in-reply-to"
    );
    const control = entry["app:control"];
    const isDraft = Boolean(control) && control["app:draft"] === "yes";
    return isPost && !isComment && !isDraft;
  });
  console.log(`found ${posts.length} posts`);

  for (const [i, post] of posts.entries()) {
    const date = moment(post.published).format("YYYY-MM-DD");

    // Untitled posts fall back to their date. The title is stringified
    // because a purely numeric title may be parsed as a number.
    const rawTitle = textOf(post.title);
    const title = rawTitle == null || rawTitle === "" ? date : String(rawTitle);

    const articleFolderName = `${date}_${normalizeFilename(title)}`;
    const articleFolder = `${OUT_DIR}/${articleFolderName}`;
    console.log(i, articleFolderName);
    shell.mkdir("-p", articleFolder);

    // A post without a body would otherwise hand `undefined` to the
    // Markdown converter.
    const rawContent = textOf(post.content);
    const html = rawContent == null ? "" : String(rawContent);
    const modifiedContent = await transformImages(articleFolder, html);

    // Only categories in the Blogger namespace are labels.
    const tags = toArray(post.category)
      .filter(
        (category) =>
          category["@_scheme"] === "http://www.blogger.com/atom/ns#"
      )
      .map((category) => category["@_term"]);

    const markdown = tds.turndown(modifiedContent);
    const categories = ["tech"];
    const header = yaml.dump({ title, date, tags, categories });
    const content = `---\n${header}---\n${markdown}`;
    fs.writeFileSync(`${articleFolder}/index.md`, content);
  }
}
// Run the conversion. Report a failure and exit with a non-zero status
// instead of leaving the rejected promise unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment