Created
November 1, 2024 14:58
-
-
Save RichardKanshen/087d2c8ea1e375411e57bebef160a5e8 to your computer and use it in GitHub Desktop.
WordPress XML export to JSON + HTML + MD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "dependencies": {
    "turndown": "^7.2.0",
    "xml2js": "^0.6.2"
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const xml2js = require('xml2js'); | |
const path = require('path'); | |
const TurndownService = require('turndown'); | |
/**
 * Normalizes a parsed XML value into an array.
 *
 * The parser below is configured with `explicitArray: false`, so an element
 * that occurs once is a single value, one that occurs several times is an
 * array, and one that does not occur is `undefined`.
 *
 * @param {*} value - Value read from the parsed XML tree.
 * @returns {Array} `[]` for null/undefined, the array itself, or `[value]`.
 */
function toArray(value) {
  if (value === undefined || value === null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/**
 * Collects the text of a post's `<category>` elements for one taxonomy domain.
 *
 * @param {*} categoryField - `post.category`: one entry, an array, or undefined.
 * @param {string} domain - Value of the `domain` attribute to keep
 *   (`'category'` or `'post_tag'`).
 * @returns {Array} The `_` (text) value of each matching entry.
 */
function collectTermNames(categoryField, domain) {
  return toArray(categoryField)
    .filter((term) => term !== null && typeof term === 'object' && term.domain === domain)
    .map((term) => term._);
}

/**
 * Builds the file-name-safe slug used for a post's exported files.
 *
 * @param {*} title - Post title; null/undefined is treated as an empty string.
 * @returns {string} Lower-case slug of at most 100 characters.
 */
function buildPostSlug(title) {
  const text = title === undefined || title === null ? '' : String(title);
  return text
    .normalize('NFD') // Decompose accented characters into base + combining mark
    .replace(/[\u0300-\u036f]/g, '') // Remove the combining diacritical marks
    .replace(/[^a-z0-9\s-]/gi, '') // Remove special characters except spaces and hyphens
    .replace(/\s+/g, '-') // Replace whitespace runs with hyphens
    .toLowerCase()
    .substring(0, 100); // Limit filename length
}

/**
 * Converts a WordPress XML export into JSON metadata plus one HTML file and
 * one Markdown file per post.
 *
 * Output is written to `<cwd>/<basename of filename>_wordpress_export/`:
 * `metadata/authors.json`, `metadata/categories.json`, `metadata/tags.json`,
 * `posts/<slug>_<post id>.html`, `posts/<slug>_<post id>.md` and
 * `wordpress_posts.json`.
 *
 * @param {string} filename - Path of the WordPress XML export file.
 * @returns {Promise<{posts: Object[], authors: Object[], categories: Object[], tags: Object[]}>}
 *   The processed records that were also written to disk.
 * @throws Re-throws any read, parse or write error after logging it.
 */
async function parseWordpressXML(filename) {
  try {
    const xmlData = fs.readFileSync(filename, 'utf8');
    const parser = new xml2js.Parser({
      explicitArray: false,
      mergeAttrs: true,
      trim: true
    });
    const result = await new Promise((resolve, reject) => {
      parser.parseString(xmlData, (err, parsed) => {
        if (err) {
          // Return so that resolve() is not also called after a failure.
          reject(err);
          return;
        }
        resolve(parsed);
      });
    });

    const channel = result && result.rss && result.rss.channel;
    if (!channel) {
      throw new Error('Invalid WordPress export: missing <rss><channel> element');
    }

    const turndownService = new TurndownService();
    const outputDir = path.join(process.cwd(), path.basename(filename) + '_wordpress_export');
    const postsDir = path.join(outputDir, 'posts');
    const metadataDir = path.join(outputDir, 'metadata');
    // `recursive: true` creates missing parents and is a no-op for existing directories.
    fs.mkdirSync(postsDir, { recursive: true });
    fs.mkdirSync(metadataDir, { recursive: true });

    // toArray() makes absent and single-occurrence elements safe to map over.
    const processedAuthors = toArray(channel['wp:author']).map((author) => ({
      id: author['wp:author_id'],
      login: author['wp:author_login'],
      email: author['wp:author_email'],
      firstName: author['wp:author_first_name'],
      lastName: author['wp:author_last_name'],
      displayName: author['wp:author_display_name'],
      // Skip missing/empty parts so absent names do not render as "undefined".
      fullName: [author['wp:author_first_name'], author['wp:author_last_name']]
        .filter(Boolean)
        .join(' ')
        .trim()
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'authors.json'),
      JSON.stringify(processedAuthors, null, 2)
    );

    const processedCategories = toArray(channel['wp:category']).map((cat) => ({
      id: cat['wp:term_id'],
      name: cat['wp:cat_name'],
      slug: cat['wp:category_nicename'],
      description: cat['wp:category_description'],
      parent: cat['wp:category_parent']
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'categories.json'),
      JSON.stringify(processedCategories, null, 2)
    );

    const processedTags = toArray(channel['wp:tag']).map((tag) => ({
      id: tag['wp:term_id'],
      name: tag['wp:tag_name'],
      slug: tag['wp:tag_slug'],
      description: tag['wp:tag_description']
    }));
    fs.writeFileSync(
      path.join(metadataDir, 'tags.json'),
      JSON.stringify(processedTags, null, 2)
    );

    // A channel holding exactly one <item> is parsed as an object, not an array.
    const posts = toArray(channel.item).filter((item) => item['wp:post_type'] === 'post');

    const processedPosts = [];
    for (const post of posts) {
      const processedPost = {
        title: post.title,
        pubDate: post.pubDate,
        creator: post['dc:creator'],
        description: post.description,
        post_id: post['wp:post_id'],
        post_date_gmt: post['wp:post_date_gmt'],
        post_modified_gmt: post['wp:post_modified_gmt'],
        status: post['wp:status'],
      };

      // The keys are only added when the post has at least one matching term.
      const categoryNames = collectTermNames(post.category, 'category');
      if (categoryNames.length > 0) {
        processedPost.categories = categoryNames;
      }
      const tagNames = collectTermNames(post.category, 'post_tag');
      if (tagNames.length > 0) {
        processedPost.tags = tagNames;
      }

      // A post without a content element is exported as empty files.
      const rawContent = post['content:encoded'];
      const html = rawContent === undefined || rawContent === null ? '' : String(rawContent);
      const baseName = `${buildPostSlug(post.title)}_${post['wp:post_id']}`;
      fs.writeFileSync(path.join(postsDir, `${baseName}.html`), html);
      fs.writeFileSync(path.join(postsDir, `${baseName}.md`), turndownService.turndown(html));

      processedPosts.push(processedPost);
    }

    const jsonFilename = path.join(outputDir, 'wordpress_posts.json');
    fs.writeFileSync(jsonFilename, JSON.stringify(processedPosts, null, 2));

    console.log(`Processed ${processedPosts.length} posts`);
    console.log(`Processed ${processedAuthors.length} authors`);
    console.log(`Processed ${processedCategories.length} categories`);
    console.log(`Processed ${processedTags.length} tags`);
    console.log(`Output saved to ${outputDir}`);

    return {
      posts: processedPosts,
      authors: processedAuthors,
      categories: processedCategories,
      tags: processedTags
    };
  } catch (error) {
    console.error('Error parsing WordPress XML:', error);
    throw error;
  }
}
// CLI entry point: the first command-line argument is the path of the
// WordPress XML export to convert.
const filename = process.argv[2];
if (!filename) {
  console.error('Please provide a WordPress XML export file');
  console.error('Usage: node script.js <filename>');
  process.exit(1);
}

parseWordpressXML(filename)
  .then(() => console.log('Export complete'))
  .catch((err) => {
    console.error('Export failed:', err);
    // Report the failure to the calling shell instead of exiting with status 0.
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment