Created
November 9, 2020 08:05
-
-
Save briandfoy/63eacd366f6157eca5d122c55f134e61 to your computer and use it in GitHub Desktop.
Turn a WordPress XML dump into a bunch of local files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl
# Turn a WordPress XML export into a set of local HTML files.
#
# Usage: perl <this-script> wordpress-export.xml
#
# Every /rss/channel/item element in the export is read into a hash of
# selected fields plus its list of categories/tags. Each item whose status
# is not "draft" is then written to
#
#     <host of the first item's link>/_posts/<link path with "/" as "-">.html
#
# as an HTML comment holding the metadata, followed by the post content.

# <<~ heredocs need 5.26 and postfix dereference needs 5.24, so declare
# that honestly; "use v5.26" also turns strict on.
use v5.26;
use warnings;
use experimental qw(signatures);

use File::Path qw(make_path);
use File::Spec::Functions;
use Mojo::DOM;
use Mojo::URL;
use Mojo::Util qw(dumper);
use XML::LibXML;

my $file = $ARGV[0] // die "Usage: $0 wordpress-export.xml\n";

# Slurp the export as raw bytes; the XML parser does its own decoding.
my $xml = do {
    local $/;
    open my $fh, '<:raw', $file or die "Could not open $file: $!";
    <$fh>;
};

my $dom = XML::LibXML->load_xml( string => $xml );

# Child elements copied from each <item>. The namespace prefix is dropped
# from the hash key, so "wp:post_id" is stored as "post_id" and
# "content:encoded" as "encoded".
my @fields = qw(
    title link dc:creator wp:post_id wp:post_date_gmt
    wp:post_name wp:status wp:post_type content:encoded
);

my @items = map {
    my $node = $_;

    my %hash = map {
        my $plain = s/\A\w+://r;
        ( $plain => $node->findvalue( $_ ) );
    } @fields;

    # <category> elements carry both categories and tags; the "domain"
    # attribute is kept as the type so they can be told apart later.
    $hash{categories} = [
        map {
            +{
                type     => $_->getAttribute( 'domain' )   // '',
                nicename => $_->getAttribute( 'nicename' ) // '',
            }
        } $node->findnodes( 'category' )
    ];

    \%hash;
} $dom->findnodes( '/rss/channel/item' );

die "No items found in $file\n" unless @items;

# All output goes under a directory named after the first item's host.
my $base = Mojo::URL->new( $items[0]{link} )->host
    // die "Could not find a host in the first item's link <$items[0]{link}>\n";
my $local_dir = catdir( $base, '_posts' );
make_path $local_dir;

ITEM: foreach my $item ( @items ) {
    next ITEM if $item->{status} eq 'draft';

    # Flatten the URL path into a single file name: "/a/b/" becomes "a-b".
    my $url_path = Mojo::URL->new( $item->{link} )->path->to_string;
    my $name     = $url_path =~ s|/|-|gr;
    $name =~ s/\A-|-\z//g;

    # A link whose path is just "/" would otherwise produce ".html" for
    # every such item, each overwriting the last.
    $name = "post-$item->{post_id}" if $name eq '';

    my $path = catfile( $local_dir, "$name.html" );
    open my $fh, '>:encoding(UTF-8)', $path or do {
        warn "Could not open <$path>: $!";
        next ITEM;
    };

    my @categories = map  { $_->{nicename} }
                     grep { $_->{type} eq 'category' }
                     $item->{categories}->@*;
    my @tags       = map  { $_->{nicename} }
                     grep { $_->{type} eq 'post_tag' }
                     $item->{categories}->@*;

    say {$fh} <<~"HERE";
        <!--
        title: $item->{title}
        link: $item->{link}
        author: $item->{creator}
        post_id: $item->{post_id}
        date: $item->{post_date_gmt}
        post_name: $item->{post_name}
        status: $item->{status}
        type: $item->{post_type}
        categories: @categories
        tags: @tags
        -->
        HERE
    say {$fh} $item->{encoded};

    # Buffered write errors only show up when the handle is closed.
    close $fh or warn "Could not close <$path>: $!";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment