Created
February 18, 2020 16:55
-
-
Save ChrisHardie/f356b2705e6659f0ec9f0b826ca87650 to your computer and use it in GitHub Desktop.
Flickr-to-WordPress: a plugin to use a WordPress powered API to find/replace Flickr references in another WordPress site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Plugin Name: Flickr Fixer
 * Plugin URI: https://chrishardie.com/
 * Description: Find/replace Flickr references
 * Version: 1.0
 * Author: Chris Hardie
 * Author URI: https://chrishardie.com/
 */

// Everything below is a WP-CLI command; stop loading this file unless the
// WP_CLI constant is both defined and truthy.
if ( ! ( defined( 'WP_CLI' ) && WP_CLI ) ) {
	return;
}
/**
 * Find/replace Flickr references in post content.
 */
class JCH_Flickr_Fixer extends WP_CLI_Command {

	/**
	 * Endpoint on the photo site that is queried with a Flickr photo URL.
	 */
	const PHOTO_SITE_API_URL = 'https://my-wp-photo-website.com/wp-json/myphotos/v1/find-by-flickr-url/';

	/**
	 * Number of posts fetched and processed between memory clean-ups.
	 */
	const BATCH_SIZE = 20;

	/**
	 * Fix Flickr References
	 *
	 * ## OPTIONS
	 *
	 * [--post_id=<id>]
	 * : Specify an individual post ID to work with
	 *
	 * [--dry_run=false]
	 * : Actually update post content in a database write operation
	 *
	 * [--debug]
	 * : Display debug output
	 *
	 * @subcommand fix-refs [--dry_run=false] [--post_id=<id>] [--debug]
	 *
	 * @param array $args       Positional args (not used).
	 * @param array $assoc_args Associative args: optional 'post_id' and 'dry_run'.
	 */
	public function fix_refs( $args, $assoc_args ) {
		global $wpdb;

		// Get a specific post, or all the posts.
		if ( isset( $assoc_args['post_id'] ) && is_numeric( $assoc_args['post_id'] ) ) {
			WP_CLI::line( 'Getting selected post...' );
			$flickr_post_ids = $wpdb->get_col(
				$wpdb->prepare(
					"SELECT ID FROM {$wpdb->posts} WHERE post_type = 'post' AND ID = %d",
					(int) $assoc_args['post_id']
				)
			);
		} else {
			WP_CLI::line( 'Getting all posts containing Flickr references...' );
			$flickr_post_ids = $wpdb->get_col(
				$wpdb->prepare(
					"SELECT ID FROM {$wpdb->posts} WHERE post_type = 'post' AND ( post_content LIKE %s OR post_content LIKE %s )",
					'%' . $wpdb->esc_like( 'flickr.com/photos/chrishardie' ) . '%',
					'%' . $wpdb->esc_like( 'flickr.com/photos/11288301@N00' ) . '%'
				)
			);
		}

		// The IDs are concatenated into an IN (...) clause below, so force them to integers.
		$flickr_post_ids   = array_map( 'intval', (array) $flickr_post_ids );
		$flickr_post_count = count( $flickr_post_ids );

		WP_CLI::line( 'Found ' . (int) $flickr_post_count . ' posts to process.' );

		if ( ! $flickr_post_count ) {
			WP_CLI::line( 'No work to do! Exiting.' );
			return;
		}

		// Database writes only happen when --dry_run=false is passed explicitly.
		$is_live_run = isset( $assoc_args['dry_run'] ) && 'false' === $assoc_args['dry_run'];

		$progress = \WP_CLI\Utils\make_progress_bar( 'Processing Flickr references', $flickr_post_count );

		// Keep track of all replacements across all posts.
		$replacements_made = 0;

		// Work on a limited number of posts at a time.
		while ( $post_ids = array_splice( $flickr_post_ids, 0, self::BATCH_SIZE ) ) {
			$posts = $wpdb->get_results( "SELECT ID, post_content FROM {$wpdb->posts} WHERE post_type = 'post' AND ID IN (" . implode( ',', $post_ids ) . ')' );

			if ( empty( $posts ) ) {
				break;
			}

			foreach ( $posts as $the_post ) {
				$admin_edit_url = admin_url( 'post.php?post=' ) . $the_post->ID . '&action=edit';

				WP_CLI::debug( "Working on Post ID: {$the_post->ID}: " . $admin_edit_url );

				$original_content = $the_post->post_content;

				// Keep track of the number of Flickr references found in this post.
				$ref_count = 0;

				// Pass 1: fixed string substitutions.
				list( $new_content, $found, $replaced ) = $this->apply_manual_replacements( $original_content );
				$ref_count                             += $found;
				$replacements_made                     += $replaced;

				// Pass 2: <a> tags (optionally wrapping an <img>) that link to a Flickr photo page.
				list( $new_content, $found, $replaced ) = $this->replace_linked_references( $new_content, $admin_edit_url );
				$ref_count                             += $found;
				$replacements_made                     += $replaced;

				// Pass 3: whatever photo page URLs are left, e.g. a URL on a line by itself for oEmbed use.
				list( $new_content, $found, $replaced ) = $this->replace_bare_references( $new_content, $admin_edit_url );
				$ref_count                             += $found;
				$replacements_made                     += $replaced;

				if ( 0 === $ref_count ) {
					WP_CLI::warning( 'No references found in processing, probably something missing in ' . $admin_edit_url );
				} elseif ( $new_content === $original_content ) {
					// References were seen but every lookup failed; there is nothing to write.
					WP_CLI::debug( $ref_count . ' Flickr references found, but none could be replaced. Leaving the post untouched.' );
				} elseif ( ! $is_live_run ) {
					WP_CLI::debug( $ref_count . ' Flickr references found, so updating content...' );
					WP_CLI::debug( 'Dry run only, so not actually making DB changes.' );
				} else {
					WP_CLI::debug( $ref_count . ' Flickr references found, so updating content...' );

					// wp_update_post() expects slashed data, and only returns a
					// WP_Error (instead of 0) when its second argument is true.
					$update_result = wp_update_post(
						array(
							'ID'           => $the_post->ID,
							'post_content' => wp_slash( $new_content ),
						),
						true
					);

					if ( is_wp_error( $update_result ) ) {
						// Report the failure without exiting so the remaining posts still get processed.
						WP_CLI::error( 'There was a problem updating the post content: ' . $update_result->get_error_message() . ' (' . $admin_edit_url . ')', false );
					} else {
						WP_CLI::debug( 'Post successfully updated.' );
						clean_post_cache( $the_post->ID );
						WP_CLI::line( 'Updated post: ' . get_the_permalink( $the_post->ID ) );
					}
				}

				$progress->tick();
			}

			// Do some memory cleanup so we don't lose control.
			self::stop_the_insanity();
		}

		$progress->finish();

		WP_CLI::success( $replacements_made . ' replacement(s) made across all posts' );
	}

	/**
	 * Apply the fixed find/replace pairs that need special/manual handling.
	 *
	 * @param string $content Post content to work on.
	 * @return array Array of: updated content, number of pairs that matched, total number of replacements.
	 */
	private function apply_manual_replacements( $content ) {
		// The key is the string to find, the value is the string to replace it with.
		// Order matters: the user ID is normalized first so the later entries only
		// have to deal with the "chrishardie" form of the URL.
		$manual_find_replace = array(
			'11288301@N00'                                     => 'chrishardie', // Older references to my Flickr user ID
			'http://www.flickr.com/photos/chrishardie/sets/'   => 'https://photos.chrishardie.com/flickrset/', // References to Flickr sets
			'https://www.flickr.com/photos/chrishardie/sets/'  => 'https://photos.chrishardie.com/flickrset/',
			'http://www.flickr.com/photos/chrishardie/tags/'   => 'https://photos.chrishardie.com/tag/', // Flickr tags
			'https://www.flickr.com/photos/chrishardie/tags/'  => 'https://photos.chrishardie.com/tag/',
			'href="http://www.flickr.com/photos/chrishardie"'  => 'href="https://photos.chrishardie.com/"', // Plain old links to my profile
			'href="http://www.flickr.com/photos/chrishardie/"' => 'href="https://photos.chrishardie.com/"',
			// Same profile links over https, matching the http/https pairs used for sets and tags.
			'href="https://www.flickr.com/photos/chrishardie"'  => 'href="https://photos.chrishardie.com/"',
			'href="https://www.flickr.com/photos/chrishardie/"' => 'href="https://photos.chrishardie.com/"',
		);

		$pairs_matched = 0;
		$replacements  = 0;

		foreach ( $manual_find_replace as $find => $replace ) {
			$content = str_replace( $find, $replace, $content, $count );

			if ( 0 < $count ) {
				$pairs_matched++;
				$replacements += $count;
			}
		}

		return array( $content, $pairs_matched, $replacements );
	}

	/**
	 * Rewrite <a> tags that link to a Flickr photo page, including an <img> inside them if present.
	 *
	 * The link href is replaced with the photo site permalink and, when the link
	 * wraps a single image, the image src is replaced with the thumbnail URL
	 * returned by the photo site.
	 *
	 * @param string $content        Post content to work on.
	 * @param string $admin_edit_url Edit URL of the post, used in warnings only.
	 * @return array Array of: updated content, number of links found, number of attributes replaced.
	 */
	private function replace_linked_references( $content, $admin_edit_url ) {
		$refs_found   = 0;
		$replacements = 0;

		// From the opening <a ... href="flickr photo url"> through to the first
		// closing </a> on the same line ("." does not match newlines).
		$pattern = '/<a[^>]+?href="https?:\/\/www\.flickr\.com\/photos\/chrishardie\/\d+.+?<\/a>/';

		if ( ! preg_match_all( $pattern, $content, $matches, PREG_SET_ORDER ) ) {
			return array( $content, $refs_found, $replacements );
		}

		// Post markup is rarely pristine; collect libxml's parse complaints
		// internally instead of letting them surface as PHP warnings.
		$previous_libxml_state = libxml_use_internal_errors( true );

		foreach ( $matches as $match ) {
			$refs_found++;
			$fragment = $match[0];

			// It would be too hard to use regexp parsing given all the different ways the attributes of a tag can be ordered.
			// Instead, use DOMDocument to programmatically parse the tag attributes and update them.
			// The XML declaration prefix makes loadHTML() read the fragment as UTF-8.
			$dom = new DOMDocument();
			$dom->loadHTML( '<?xml encoding="UTF-8">' . $fragment );
			libxml_clear_errors();

			$a_tags   = $dom->getElementsByTagName( 'a' );
			$img_tags = $dom->getElementsByTagName( 'img' );

			if ( 0 === $a_tags->length ) {
				WP_CLI::warning( 'Could not parse a matched link in ' . $admin_edit_url );
				continue;
			}

			// We're only expecting one link and one image per match.
			if ( 1 < $a_tags->length || 1 < $img_tags->length ) {
				WP_CLI::warning( 'More than one a tag or img tag: ' . $admin_edit_url );
				continue;
			}

			$a_node   = $a_tags->item( 0 );
			$img_node = $img_tags->item( 0 ); // null when the link has no image inside.

			// Extract the Flickr link in particular we want to replace, dropping any "in/..." context suffix.
			$flickr_link = preg_replace( '/(https?:\/\/www\.flickr\.com\/photos\/chrishardie\/\d+\/)(in\/\S+)?/', '$1', $a_node->getAttribute( 'href' ) );

			// For our API lookup, add a trailing slash if there isn't one
			$flickr_link = rtrim( $flickr_link, '/' ) . '/';

			// Get width and height for the img tag, if there is one.
			// This allows us to retrieve an img src value appropriate for replacing, instead of full size.
			$width  = null;
			$height = null;

			if ( $img_node ) {
				$width  = $img_node->getAttribute( 'width' );
				$height = $img_node->getAttribute( 'height' );
			}

			// Look it up on our WordPress photo site
			$photo_data = $this->get_photo_post_by_flickr_id( $flickr_link, $width, $height );

			if ( ! $photo_data || empty( $photo_data->permalink ) ) {
				WP_CLI::warning( 'Could not find a valid photo site post for ' . $flickr_link . ' in ' . $admin_edit_url );
				continue;
			}

			WP_CLI::debug( 'Found valid photo site post for Flickr URL.' );
			WP_CLI::debug( $flickr_link . ' --> ' . $photo_data->permalink );

			$a_node->setAttribute( 'href', $photo_data->permalink );
			$replacements++;

			// If we're working with an img tag, update it too
			if ( $img_node ) {
				if ( empty( $photo_data->thumbnail_url ) ) {
					WP_CLI::warning( 'No valid replacement image src url in ' . $admin_edit_url );
				} else {
					WP_CLI::debug( $img_node->getAttribute( 'src' ) . ' --> ' . $photo_data->thumbnail_url );
					$img_node->setAttribute( 'src', $photo_data->thumbnail_url );
					$replacements++;
				}
			}

			// Serialize only the <a> element (and its children), so none of the
			// DOCTYPE/html/body wrapper that DOMDocument adds ends up in the post.
			$content = str_replace( $fragment, $dom->saveHTML( $a_node ), $content );
		}

		libxml_use_internal_errors( $previous_libxml_state );

		return array( $content, $refs_found, $replacements );
	}

	/**
	 * Replace the remaining Flickr photo page URLs with photo site permalinks.
	 *
	 * Each occurrence is replaced exactly where it was matched, so a URL that
	 * happens to be a prefix of a longer one is never rewritten by accident.
	 *
	 * @param string $content        Post content to work on.
	 * @param string $admin_edit_url Edit URL of the post, used in warnings only.
	 * @return array Array of: updated content, number of URLs found, number of URLs replaced.
	 */
	private function replace_bare_references( $content, $admin_edit_url ) {
		$refs_found   = 0;
		$replacements = 0;

		// Group 1 is the canonical photo URL. The optional "in/..." suffix stops at
		// whitespace, quotes and angle brackets so that a URL sitting inside an HTML
		// attribute does not drag the rest of the tag into the match.
		$pattern = '/(https?:\/\/www\.flickr\.com\/photos\/chrishardie\/\d+\/)(in\/[^\s"\'<>]+)?/';

		$updated = preg_replace_callback(
			$pattern,
			function ( $match ) use ( &$refs_found, &$replacements, $admin_edit_url ) {
				$refs_found++;

				$found_flickr_ref = $match[1];
				$photo_data       = $this->get_photo_post_by_flickr_id( $found_flickr_ref );

				if ( ! $photo_data || empty( $photo_data->permalink ) ) {
					WP_CLI::warning( 'Could not find a valid photo site post for ' . $found_flickr_ref . ' in ' . $admin_edit_url );
					// Returning the full match leaves this occurrence as it was.
					return $match[0];
				}

				WP_CLI::debug( 'Found valid photo site post for Flickr URL.' );
				WP_CLI::debug( $found_flickr_ref . ' --> ' . $photo_data->permalink );

				$replacements++;

				return $photo_data->permalink;
			},
			$content
		);

		// preg_replace_callback() returns null when the regex engine fails; keep the content as it was.
		if ( null === $updated ) {
			WP_CLI::warning( 'Pattern matching failed while processing ' . $admin_edit_url );
			return array( $content, 0, 0 );
		}

		return array( $updated, $refs_found, $replacements );
	}

	/**
	 * Clear all of the caches for memory management
	 *
	 * Empties $wpdb->queries and, when the global object cache is an object,
	 * resets its group_ops, stats, memcache_debug and cache properties and
	 * calls its __remoteset() method if it has one.
	 */
	public static function stop_the_insanity() {
		/**
		 * @var \WP_Object_Cache $wp_object_cache
		 * @var \wpdb $wpdb
		 */
		global $wpdb, $wp_object_cache;

		$wpdb->queries = array(); // or define( 'WP_IMPORTING', true );

		if ( ! is_object( $wp_object_cache ) ) {
			return;
		}

		foreach ( array( 'group_ops', 'stats', 'memcache_debug', 'cache' ) as $property ) {
			$wp_object_cache->$property = array();
		}

		if ( method_exists( $wp_object_cache, '__remoteset' ) ) {
			$wp_object_cache->__remoteset(); // important
		}
	}

	/**
	 * Look up the photo site post info via API using the Flickr ID and possibly width/height
	 *
	 * Successful lookups are stored in the object cache (group "flickr_fixer")
	 * under a key made of the URL plus any numeric width/height.
	 *
	 * @param string|null     $flickr_url Flickr photo page URL to look up.
	 * @param int|string|null $width      Optional image width; only sent when numeric.
	 * @param int|string|null $height     Optional image height; only sent when numeric.
	 * @return object|false Decoded API response whose `result` is 'found' (callers read its
	 *                      `permalink` and `thumbnail_url` properties), or false when the URL
	 *                      is empty, the request fails, or the response is not a 'found' result.
	 */
	public function get_photo_post_by_flickr_id( $flickr_url = null, $width = null, $height = null ) {
		$cache_group  = 'flickr_fixer';
		$cache_expire = 7 * DAY_IN_SECONDS;

		if ( empty( $flickr_url ) ) {
			return false;
		}

		// Always use the SSL version, since that's what was provided in the Flickr site data export/import process.
		$flickr_url = preg_replace( '/^http:/i', 'https:', $flickr_url );

		$cache_key = $flickr_url;

		// The Flickr URL is the main key we use to look up the presence of a Flickr photo on the new WordPress site.
		// add_query_arg() does not encode values, so encode the URL ourselves.
		$query_args = array(
			'flickr-url' => rawurlencode( $flickr_url ),
		);

		// If we received a width, use it
		if ( is_numeric( $width ) ) {
			$query_args['width'] = $width;
			$cache_key          .= '--' . $width;
		}

		// If we received a height, use it
		if ( is_numeric( $height ) ) {
			$query_args['height'] = $height;
			$cache_key           .= '--' . $height;
		}

		$photo_site_api_url = add_query_arg( $query_args, self::PHOTO_SITE_API_URL );

		// It makes sense to cache the API responses from our WordPress-hosted photo site, they'll rarely change.
		WP_CLI::debug( 'Checking cache for ' . $cache_key );

		$cache_data = wp_cache_get( $cache_key, $cache_group, false, $found );

		if ( ( true === $found ) && ! empty( $cache_data ) ) {
			WP_CLI::debug( 'Cache hit' );
			return $cache_data;
		}

		// If no cache hit, do an actual lookup via API.
		WP_CLI::debug( 'Fetching ' . $photo_site_api_url );

		$photo_site_request = wp_remote_get( $photo_site_api_url );

		if ( is_wp_error( $photo_site_request ) ) {
			WP_CLI::debug( 'Request failed: ' . $photo_site_request->get_error_message() );
			return false;
		}

		$photo_data = json_decode( wp_remote_retrieve_body( $photo_site_request ), false );

		// json_decode() yields null for invalid JSON and arrays/scalars for
		// non-object JSON; only an object with result "found" counts as a hit.
		if ( ! is_object( $photo_data ) || ! isset( $photo_data->result ) || 'found' !== $photo_data->result ) {
			return false;
		}

		// A failed cache write only costs a repeat lookup later, so warn and carry on.
		if ( false === wp_cache_set( $cache_key, $photo_data, $cache_group, $cache_expire ) ) {
			WP_CLI::warning( 'Failed to set the cache value for key ' . $cache_key );
		}

		return $photo_data;
	}
}
// Expose the class to WP-CLI under the "flickr-fixer" command name.
WP_CLI::add_command(
	'flickr-fixer',
	'JCH_Flickr_Fixer'
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment