Skip to content

Instantly share code, notes, and snippets.

@RichardKanshen
Created November 1, 2024 14:58
Show Gist options
  • Save RichardKanshen/087d2c8ea1e375411e57bebef160a5e8 to your computer and use it in GitHub Desktop.
Save RichardKanshen/087d2c8ea1e375411e57bebef160a5e8 to your computer and use it in GitHub Desktop.
WordPress XML Export 2 JSON + HTML + MD
{
"dependencies": {
"turndown": "^7.2.0",
"xml2js": "^0.6.2"
}
}
const fs = require('fs');
const xml2js = require('xml2js');
const path = require('path');
const TurndownService = require('turndown');
/**
 * Normalises a value produced by xml2js with `explicitArray: false` into an array.
 *
 * With that option a child element that occurs once is returned as a single
 * value and one that never occurs is `undefined`; only repeated elements come
 * back as arrays. Callers that want to iterate need a uniform shape.
 *
 * @param {*} value - Parsed node, array of nodes, or null/undefined.
 * @returns {Array} `[]` for null/undefined, the array itself, or `[value]`.
 */
function toArray(value) {
  if (value == null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/**
 * Collects the text of a post's <category> elements that belong to one domain.
 *
 * The parser is configured with `mergeAttrs: true`, so each element is read as
 * an object whose `domain` property is the attribute and whose `_` property is
 * the element text.
 *
 * @param {*} postCategory - `post.category` as parsed (object, array or undefined).
 * @param {string} domain - Domain to keep, e.g. 'category' or 'post_tag'.
 * @returns {Array} Text values of the matching elements (possibly empty).
 */
function termNames(postCategory, domain) {
  return toArray(postCategory)
    .filter(term => term != null && term.domain === domain)
    .map(term => term._);
}

/**
 * Turns a post title into a lowercase, filesystem-friendly slug.
 *
 * @param {*} title - Post title; null/undefined is treated as an empty string.
 * @returns {string} Slug of at most 100 characters (may be empty).
 */
function slugifyTitle(title) {
  return String(title == null ? '' : title)
    .normalize('NFD') // Decompose accented characters into base + combining mark
    .replace(/[\u0300-\u036f]/g, '') // Remove the combining diacritical marks
    .replace(/[^a-z0-9\s-]/gi, '') // Remove special characters except spaces and hyphens
    .replace(/\s+/g, '-') // Replace whitespace runs with hyphens
    .toLowerCase()
    .substring(0, 100); // Limit filename length
}

/**
 * Converts a WordPress XML export into JSON metadata plus per-post HTML and
 * Markdown files.
 *
 * Output is written to `<cwd>/<basename(filename)>_wordpress_export/`:
 *   - `metadata/authors.json`, `metadata/categories.json`, `metadata/tags.json`
 *   - `posts/<slug>_<post id>.html` and `.md` for every item of type 'post'
 *   - `wordpress_posts.json` with the post records
 *
 * @param {string} filename - Path of the WordPress XML export file.
 * @returns {Promise<{posts: Array, authors: Array, categories: Array, tags: Array}>}
 *   The processed records that were also written to disk.
 * @throws Re-throws any read, parse or write error after logging it; throws an
 *   Error when the document has no <rss><channel> element.
 */
async function parseWordpressXML(filename) {
  try {
    const xmlData = fs.readFileSync(filename, 'utf8');
    const parser = new xml2js.Parser({
      explicitArray: false,
      mergeAttrs: true,
      trim: true
    });
    const result = await new Promise((resolve, reject) => {
      parser.parseString(xmlData, (err, parsed) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(parsed);
      });
    });

    const channel = result && result.rss ? result.rss.channel : undefined;
    if (channel == null) {
      throw new Error(`${filename} is not a WordPress export: missing <rss><channel> element`);
    }

    const turndownService = new TurndownService();
    const outputDir = path.join(process.cwd(), path.basename(filename) + '_wordpress_export');
    const postsDir = path.join(outputDir, 'posts');
    const metadataDir = path.join(outputDir, 'metadata');
    // `recursive` also creates outputDir and is a no-op for existing directories.
    fs.mkdirSync(postsDir, { recursive: true });
    fs.mkdirSync(metadataDir, { recursive: true });

    const processedAuthors = toArray(channel['wp:author']).map(author => ({
      id: author['wp:author_id'],
      login: author['wp:author_login'],
      email: author['wp:author_email'],
      firstName: author['wp:author_first_name'],
      lastName: author['wp:author_last_name'],
      displayName: author['wp:author_display_name'],
      // Skip absent name parts so they are not rendered as the text "undefined".
      fullName: [author['wp:author_first_name'], author['wp:author_last_name']]
        .filter(Boolean)
        .join(' ')
        .trim()
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'authors.json'),
      JSON.stringify(processedAuthors, null, 2)
    );

    const processedCategories = toArray(channel['wp:category']).map(cat => ({
      id: cat['wp:term_id'],
      name: cat['wp:cat_name'],
      slug: cat['wp:category_nicename'],
      description: cat['wp:category_description'],
      parent: cat['wp:category_parent']
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'categories.json'),
      JSON.stringify(processedCategories, null, 2)
    );

    const processedTags = toArray(channel['wp:tag']).map(tag => ({
      id: tag['wp:term_id'],
      name: tag['wp:tag_name'],
      slug: tag['wp:tag_slug'],
      description: tag['wp:tag_description']
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'tags.json'),
      JSON.stringify(processedTags, null, 2)
    );

    // A single <item> is parsed as an object rather than an array, so normalise first.
    const posts = toArray(channel.item)
      .filter(item => item != null && item['wp:post_type'] === 'post');

    const processedPosts = [];
    for (const post of posts) {
      const processedPost = {
        title: post.title,
        pubDate: post.pubDate,
        creator: post['dc:creator'],
        description: post.description,
        post_id: post['wp:post_id'],
        post_date_gmt: post['wp:post_date_gmt'],
        post_modified_gmt: post['wp:post_modified_gmt'],
        status: post['wp:status'],
      };

      // `categories` / `tags` are only attached when the post actually has some.
      const categoryNames = termNames(post.category, 'category');
      if (categoryNames.length > 0) {
        processedPost.categories = categoryNames;
      }
      const tagNames = termNames(post.category, 'post_tag');
      if (tagNames.length > 0) {
        processedPost.tags = tagNames;
      }

      // The post id suffix keeps filenames unique when slugs collide or are empty.
      const baseName = `${slugifyTitle(post.title)}_${post['wp:post_id']}`;
      const html = post['content:encoded'] == null ? '' : post['content:encoded'];
      fs.writeFileSync(path.join(postsDir, `${baseName}.html`), html);
      fs.writeFileSync(path.join(postsDir, `${baseName}.md`), turndownService.turndown(html));

      processedPosts.push(processedPost);
    }

    const jsonFilename = path.join(outputDir, 'wordpress_posts.json');
    fs.writeFileSync(jsonFilename, JSON.stringify(processedPosts, null, 2));

    console.log(`Processed ${processedPosts.length} posts`);
    console.log(`Processed ${processedAuthors.length} authors`);
    console.log(`Processed ${processedCategories.length} categories`);
    console.log(`Processed ${processedTags.length} tags`);
    console.log(`Output saved to ${outputDir}`);

    return {
      posts: processedPosts,
      authors: processedAuthors,
      categories: processedCategories,
      tags: processedTags
    };
  } catch (error) {
    console.error('Error parsing WordPress XML:', error);
    throw error;
  }
}
// Command-line entry point: the export file path is the first CLI argument.
const filename = process.argv[2];
if (!filename) {
  console.error('Please provide a WordPress XML export file');
  console.error('Usage: node script.js <filename>');
  process.exit(1);
}
parseWordpressXML(filename)
  .then(() => console.log('Export complete'))
  .catch(err => {
    console.error('Export failed:', err);
    // Report the failure through the exit status so shells and CI can detect it;
    // setting exitCode (instead of calling exit) lets pending output flush first.
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment