Last active
November 7, 2024 08:18
-
-
Save vishalkakadiya/30aac46bcd3342e88ef8382ed041f048 to your computer and use it in GitHub Desktop.
WordPress Migration: Convert HTML DOM nodes into common Gutenberg blocks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * This snippet is useful when migrating content into WordPress.
 * It converts HTML DOM nodes into the common Gutenberg block comments,
 * so a tag such as a paragraph is converted into "<!-- wp:paragraph -->SOME DATA<!-- /wp:paragraph -->".
 *
 * This script supports paragraphs, headings, lists and many more tags.
 */
/**
 * Convert the children of a DOM node into Gutenberg block markup.
 *
 * Walks `$dom_node->childNodes` and recurses into every element that has
 * children. Known tags (p, ol/ul, h1-h6, blockquote, pre, img, hr, ...) are
 * wrapped in the matching `<!-- wp:* -->` block comments; other elements are
 * re-emitted as plain tags around their converted children.
 *
 * This function uses `$this` (parse_dom_node(), download_inline_image() and
 * $original_site_domain), so it has to be a method of the migration class.
 *
 * @param \DOMNode $dom_node         Node whose children are converted.
 * @param string   $parent_tag_start Markup placed before the converted children
 *                                   (block comment plus opening tag of $dom_node).
 * @param string   $parent_tag_end   Markup placed after the converted children
 *                                   (closing tag of $dom_node plus block comment).
 *
 * @return string Converted markup.
 */
function parse_dom_node( \DOMNode $dom_node, $parent_tag_start = '', $parent_tag_end = '' ) {
	$content = '';

	// Set once the parent wrapper has already been emitted (or must not be
	// emitted at all), so it is not added a second time after the loop.
	$skip_content = false;

	// phpcs:disable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase

	if ( ! empty( $dom_node->childNodes ) ) {
		foreach ( $dom_node->childNodes as $node ) {
			$name = $node->nodeName;

			if ( in_array( $name, [ 'html', 'body', 'head' ], true ) ) {
				// Structural wrappers: only their children matter.
				$content .= $this->parse_dom_node( $node );
			} elseif ( '#text' === $name && '' !== (string) $node->nodeValue ) {
				// The DOM hands back decoded text ("&lt;" is already "<"), so it has
				// to be re-encoded before it is written into markup again. The
				// explicit '' comparison keeps a literal "0" text node.
				$content .= htmlspecialchars( $node->nodeValue, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8' );
			} elseif ( $node->hasChildNodes() ) {
				$block_start = '';
				$block_end   = '';
				$tag_attrs   = '';

				// Identify the tag names and convert to Gutenberg block markup.
				if ( 'p' === $name ) {
					$parent = $node->parentNode;

					if ( 'blockquote' === $parent->nodeName && 'twitter-tweet' === $parent->getAttribute( 'class' ) ) {
						// Tweet embeds are flattened. Append (not assign) so that
						// anything converted before this paragraph is kept.
						$content .= $this->parse_dom_node( $node );
						continue;
					} elseif ( 'blockquote' !== $parent->nodeName ) {
						// Paragraphs inside a quote stay plain <p> tags.
						$block_start = '<!-- wp:paragraph -->';
						$block_end   = '<!-- /wp:paragraph -->';
					}
				} elseif ( in_array( $name, [ 'ol', 'ul' ], true ) ) {
					// Ordered lists keep their <ol> tag; the list block marks them
					// with the "ordered" attribute.
					$block_start = 'ol' === $name ? '<!-- wp:list {"ordered":true} -->' : '<!-- wp:list -->';
					$block_end   = '<!-- /wp:list -->';
				} elseif ( in_array( $name, [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ], true ) ) {
					$level = (int) substr( $name, 1 );

					// Level 2 is the heading block default and carries no attribute.
					$block_start = 2 === $level
						? '<!-- wp:heading -->'
						: '<!-- wp:heading {"level":' . $level . '} -->';
					$block_end   = '<!-- /wp:heading -->';
				} elseif ( 'a' === $name ) {
					$link        = $node->getAttribute( 'href' );
					$wraps_image = 'img' === $node->firstChild->nodeName;

					if ( $wraps_image && 'p' === $node->parentNode->nodeName ) {
						// Close the surrounding paragraph here so the image block that
						// follows is not nested inside it.
						$content      = $parent_tag_start . $content . $parent_tag_end;
						$skip_content = true;
					}

					if ( ! empty( $link ) ) {
						if ( ! $wraps_image ) {
							// Text link: emit it inline and do not recurse.
							$content = sprintf(
								'%1$s <a href="%2$s">%3$s</a>',
								$content,
								esc_url( $link ),
								htmlspecialchars( $node->nodeValue, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8' )
							);
							continue;
						}

						// Linked image: fall through to the generic recursion so the
						// <img> branch can wrap the figure in this anchor.
						$tag_attrs .= ' href="' . esc_url( $link ) . '"';
					}

					if ( '_blank' === $node->getAttribute( 'target' ) ) {
						$tag_attrs .= ' rel="noreferrer noopener" aria-label=" (opens in a new tab)"';
					}
				} elseif ( 'blockquote' === $name ) {
					if ( ! $node->nodeValue ) {
						$skip_content = true;
						continue;
					} elseif ( 'twitter-tweet' === $node->getAttribute( 'class' ) ) {
						$content .= $this->parse_dom_node( $node );
						continue;
					}

					$block_start = '<!-- wp:quote -->';
					$block_end   = '<!-- /wp:quote -->';
					$tag_attrs   = ' class="wp-block-quote"';
				} elseif ( 'pre' === $name ) {
					$block_start = '<!-- wp:preformatted -->';
					$block_end   = '<!-- /wp:preformatted -->';
					$tag_attrs   = ' class="wp-block-preformatted"';
				} elseif ( 'object' === $name ) {
					// The <object> wrapper itself is dropped; only its children are converted.
					$content .= $this->parse_dom_node( $node );
					continue;
				} elseif ( 'strong' === $name ) {
					if ( 'p' === $node->firstChild->nodeName ) {
						// <strong> wrapping block-level paragraphs: unwrap it.
						$content .= $this->parse_dom_node( $node );
						continue;
					}
				} elseif ( 'div' === $name ) {
					$container_classes = $node->getAttribute( 'class' );
					$div_start         = '';
					$div_end           = '';

					if ( false !== strpos( $container_classes, 'field-name-field-see-related' ) ) {
						$div_start = '<!-- wp:paragraph -->';
						$div_end   = '<!-- /wp:paragraph -->';
					}

					// The <div> tag itself is never re-emitted.
					$content .= $this->parse_dom_node( $node, $div_start, $div_end );
					continue;
				}

				// Final content string.
				$tag_start = "{$block_start}<{$name}{$tag_attrs}>";
				$tag_end   = "</{$name}>{$block_end}";
				$content  .= $this->parse_dom_node( $node, $tag_start, $tag_end );
			} else {
				// Leaf nodes. Anything not listed here (e.g. <param>) is dropped.
				$block_start = '';
				$block_end   = '';

				if ( 'script' === $name ) {
					$src = $node->getAttribute( 'src' );

					if ( false !== strpos( $src, 'playbuzz.com' ) ) {
						// <script> is not a void element and the block comment must be closed.
						$content .= '<!-- wp:html -->';
						$content .= '<script type="text/javascript" src="' . esc_url( $src ) . '"></script>';
						$content .= '<!-- /wp:html -->';
					}
				} elseif ( 'img' === $name ) {
					$src            = $node->getAttribute( 'src' );
					$parsed_url     = wp_parse_url( $src );
					$img_url        = isset( $parsed_url['path'] ) ? $parsed_url['path'] : '';
					$download_image = [];

					// Only images under "/media/" are imported.
					if ( false !== strpos( $src, '/media/' ) ) {
						$img_url        = str_replace( '/media/', $this->original_site_domain . '/media/', $img_url );
						$download_image = $this->download_inline_image( $img_url );
					}

					if ( ! empty( $download_image['url'] ) ) {
						$tag_attrs  = ' src="' . esc_url( $download_image['url'] ) . '"';
						$tag_attrs .= ' alt="' . esc_attr( $node->getAttribute( 'alt' ) ) . '"';

						$block_start  = '<!-- wp:image -->';
						$block_start .= '<figure class="wp-block-image">';

						if ( 'p' === $node->parentNode->nodeName ) {
							// Close the paragraph before the image block starts.
							$content      = $parent_tag_start . $content . $parent_tag_end;
							$skip_content = true;
						} elseif ( 'a' === $node->parentNode->nodeName ) {
							// Move the surrounding anchor inside the figure.
							$block_start .= $parent_tag_start;
							$block_end   .= $parent_tag_end;
							$skip_content = true;
						}

						$block_end .= '</figure>';
						$block_end .= '<!-- /wp:image -->';

						$content .= "{$block_start}<{$name}{$tag_attrs} />{$block_end}";
					}
				} elseif ( 'hr' === $name ) {
					$block_start = '<!-- wp:separator -->';
					$block_end   = '<!-- /wp:separator -->';
					$tag_attrs   = ' class="wp-block-separator"';

					$content .= "{$block_start}<{$name}{$tag_attrs} />{$block_end}";
				}
			}
		}

		if ( ! $skip_content ) {
			$content = $parent_tag_start . $content . $parent_tag_end;
		}
	}

	// phpcs:enable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase

	return $content;
}
/**
 * Import an inline image into the media library and return its new location.
 *
 * The URL is first passed through `$this->get_img_url()`. If
 * `$this->get_media_id_from_url()` already yields an attachment id for it,
 * that attachment is reused; otherwise the file is downloaded and sideloaded,
 * and the source URL is stored on the new attachment via set_media_meta().
 *
 * @param string $url The image URL.
 *
 * @return array Empty array on failure, otherwise:
 *               'url' => URL of the attachment, 'id' => attachment id.
 */
function download_inline_image( $url ) {
	$url          = $this->get_img_url( $url );
	$attach_id    = $this->get_media_id_from_url( $url );
	$newly_import = false;

	if ( ! $attach_id ) {
		// download_url() and media_handle_sideload() live in admin-only include
		// files; load them if this code is running outside wp-admin.
		if ( ! function_exists( 'download_url' ) || ! function_exists( 'media_handle_sideload' ) ) {
			require_once ABSPATH . 'wp-admin/includes/file.php';
			require_once ABSPATH . 'wp-admin/includes/image.php';
			require_once ABSPATH . 'wp-admin/includes/media.php';
		}

		// Download the image to a temporary file (15 second timeout).
		$tmp_file = download_url( $url, 15 );

		if ( is_wp_error( $tmp_file ) ) {
			\WP_CLI::log( "\nERROR: Unable to download image: {$url} ({$tmp_file->get_error_message()})" );

			return [];
		}

		$file = [
			'name'     => basename( $url ),
			'tmp_name' => $tmp_file,
			'error'    => 0,
			'size'     => filesize( $tmp_file ),
		];

		$attach_id = media_handle_sideload( $file );

		if ( is_wp_error( $attach_id ) ) {
			// The temporary file was not moved into uploads; do not leave it behind.
			wp_delete_file( $tmp_file );

			\WP_CLI::log( "\nERROR: Unable to sideload image: {$url} ({$attach_id->get_error_message()})" );

			return [];
		}

		$newly_import = true;
	}

	if ( is_wp_error( $attach_id ) || empty( $attach_id ) ) {
		return [];
	}

	if ( $newly_import ) {
		$this->set_media_meta( $attach_id, $url );
	}

	// Prefer the real attachment URL; the guid is only kept as a fallback.
	$attachment_url = wp_get_attachment_url( $attach_id );
	if ( empty( $attachment_url ) ) {
		$attachment_url = get_post_field( 'guid', $attach_id );
	}

	return [
		'url' => $attachment_url,
		'id'  => $attach_id,
	];
}
/**
 * Store the original image URL on an attachment.
 *
 * Writes the URL to the `_original_image_url` post meta key of the given
 * attachment.
 *
 * @param int    $attach_id Attachment id.
 * @param string $url       Original URL.
 */
function set_media_meta( $attach_id, $url ) {
	$meta_key = '_original_image_url';

	update_post_meta( $attach_id, $meta_key, $url );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment