Last active
December 1, 2023 15:54
-
-
Save mgratch/ec801047eee7ae3bc06d717200c4196a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*
Plugin Name: CSV URL Processor
Plugin URI: https://gist.github.com/mgratch/ec801047eee7ae3bc06d717200c4196a
Description: A WP-CLI command to process URLs from a CSV file.
Version: 1.0
Author: Marc Gratch
Author URI: https://marcgratch.com
*/
if ( defined( 'WP_CLI' ) && WP_CLI ) {
	/**
	 * WP-CLI command that rebuilds redirects from a CSV file of source/destination URL pairs.
	 *
	 * The CSV is expected to have a header row followed by rows whose first column is the
	 * source URL and whose second column is the destination URL. Chains inside the CSV
	 * (A -> B, B -> C) are collapsed so every source points at its final destination, each
	 * destination is checked with an HTTP request, and a redirect is created through
	 * Red_Item::create() for every destination that answers with HTTP 200.
	 *
	 * NOTE(review): Red_Item and the `redirection_items` table are presumably supplied by
	 * the Redirection plugin - confirm it is active wherever this command is used.
	 */
	class CSV_URL_Processor_Command extends WP_CLI_Command {

		/**
		 * Domain whose absolute URLs are treated as site-local when no --domain is given.
		 */
		const DEFAULT_DOMAIN = '4cornerresources.com';

		/**
		 * Processes the CSV file and outputs the path to the failed URLs file.
		 *
		 * WARNING: this command truncates the `redirection_items` table before importing.
		 *
		 * ## OPTIONS
		 *
		 * <file>
		 * : The path to the CSV file.
		 *
		 * [--domain=<domain>]
		 * : Domain whose absolute URLs are treated as local paths when collapsing redirect chains.
		 * ---
		 * default: 4cornerresources.com
		 * ---
		 *
		 * ## EXAMPLES
		 *
		 *     wp csv_process file_path.csv
		 *     wp csv_process file_path.csv --domain=example.com
		 *
		 * @when after_wp_load
		 *
		 * @param array $args       The positional arguments.
		 * @param array $assoc_args The associative arguments.
		 */
		public function __invoke( $args, $assoc_args ) {
			global $wpdb;

			list( $file_path ) = $args;
			$domain            = $assoc_args['domain'] ?? self::DEFAULT_DOMAIN;

			// Validate every prerequisite BEFORE the destructive truncate below, so that a
			// mistyped path or a missing dependency cannot wipe the existing redirects.
			if ( ! is_file( $file_path ) || ! is_readable( $file_path ) ) {
				WP_CLI::error( "CSV file not found or not readable: $file_path" );
			}
			if ( ! class_exists( 'Red_Item' ) ) {
				WP_CLI::error( 'The Red_Item class is not available, so redirects cannot be created.' );
			}

			// Truncate $wpdb->prefix . 'redirection_items' table.
			$wpdb->query( "TRUNCATE TABLE {$wpdb->prefix}redirection_items" );

			// Flush rewrite rules, the object cache and all transients via WP-CLI.
			WP_CLI::runcommand( 'rewrite flush' );
			WP_CLI::runcommand( 'cache flush' );
			WP_CLI::runcommand( 'transient delete --all' );

			$failed_urls = $this->process_csv_file( $file_path, $domain );

			// Generate a timestamped filename (UTC) for failed URLs, relative to the current directory.
			$timestamp        = gmdate( 'Ymd_His' );
			$failed_file_path = 'failed_urls_' . $timestamp . '.csv';
			$failed_handle    = fopen( $failed_file_path, 'w' );

			if ( false === $failed_handle ) {
				WP_CLI::error( 'Processed CSV, but could not write the failed URLs file: ' . $failed_file_path );
			}

			foreach ( $failed_urls as $failed_row ) {
				fputcsv( $failed_handle, $failed_row );
			}
			fclose( $failed_handle );

			WP_CLI::success( 'Processed CSV. Failed URLs saved to: ' . $failed_file_path );
		}

		/**
		 * Reads the CSV file, processes URLs, and returns the failed URLs.
		 *
		 * Runs in two passes: the first collapses redirect chains found inside the CSV, the
		 * second requests every final destination and creates a redirect when it returns 200.
		 *
		 * @param string $file_path The path to the CSV file.
		 * @param string $domain    Domain whose absolute URLs are treated as site-local paths.
		 *
		 * @return array Rows of array( source, destination, status code, message ) for every
		 *               redirect that could not be validated or created.
		 */
		private function process_csv_file( $file_path, $domain = self::DEFAULT_DOMAIN ) {
			$csv_data   = $this->read_csv_rows( $file_path );
			$total_rows = count( $csv_data );

			$progress              = \WP_CLI\Utils\make_progress_bar( 'Processing URLs', $total_rows );
			$redirects             = array();
			$failed_urls           = array();
			$source_to_destination = array_column( $csv_data, 1, 0 );

			foreach ( $csv_data as $index => $row ) {
				list( $source_url, $destination_url ) = $row;

				WP_CLI::log( "Processing URL $index: $source_url -> $destination_url" );

				// Store the source URL with its final destination URL. A source that appears
				// more than once keeps the destination of its last occurrence.
				$redirects[ $source_url ] = $this->resolve_final_destination( $destination_url, $source_to_destination, $domain );

				$progress->tick();
			}
			$progress->finish();

			WP_CLI::log( 'Completed processing of CSV file. Now validating and creating redirects.' );

			// Initialize a new progress bar for redirects validation.
			$progress = \WP_CLI\Utils\make_progress_bar( 'Validating and Creating Redirects', count( $redirects ) );

			// Validate and create redirects.
			foreach ( $redirects as $source => $destination ) {
				// PHP casts numeric-string array keys to int; restore the string form.
				$source = (string) $source;

				WP_CLI::log( "Validating URL: $source -> $destination" );

				$abs_url  = $this->to_absolute_url( $destination );
				$response = wp_remote_get(
					$abs_url,
					array(
						'timeout'   => 10,
						// NOTE(review): certificate verification is disabled; the response is
						// only used for its status code. Confirm this is intentional.
						'sslverify' => false,
					)
				);

				if ( is_wp_error( $response ) ) {
					// Transport failure (DNS, timeout, ...): there is no HTTP status, so keep
					// the code empty but preserve the reason instead of dropping it.
					$code    = '';
					$message = $response->get_error_message();
				} else {
					$code    = wp_remote_retrieve_response_code( $response );
					$message = wp_remote_retrieve_response_message( $response );
				}

				if ( 200 === $code ) {
					$item    = array(
						'url'         => trim( $source ),
						'action_data' => array( 'url' => trim( $destination ) ),
						'regex'       => false,
						'group_id'    => 1,
						'match_type'  => 'url',
						'action_type' => 'url',
						'action_code' => 301,
					);
					$created = Red_Item::create( $item );

					if ( is_wp_error( $created ) ) {
						WP_CLI::warning( "Failed to create redirect for: $source -> $destination. Error: " . $created->get_error_message() );
						$failed_urls[] = array( $source, $destination, '500', $created->get_error_message() );
					} else {
						WP_CLI::success( "Redirect created for: $source -> $destination" );
					}
				} else {
					WP_CLI::warning( "URL validation failed for: $source -> $abs_url. HTTP Status: $code $message" );
					$failed_urls[] = array( $source, $destination, $code, $message );
				}

				$progress->tick();
			}
			$progress->finish();

			return $failed_urls;
		}

		/**
		 * Loads the source/destination pairs from the CSV file, skipping the header row.
		 *
		 * Both values are trimmed. Blank lines are skipped silently; records missing either
		 * value are skipped with a warning instead of triggering undefined-offset errors.
		 *
		 * @param string $file_path The path to the CSV file.
		 *
		 * @return array List of array( source, destination ) pairs.
		 */
		private function read_csv_rows( $file_path ) {
			$handle = fopen( $file_path, 'r' );
			if ( false === $handle ) {
				WP_CLI::error( "Unable to open CSV file: $file_path" );
			}

			$rows   = array();
			$record = 1; // The header counts as record 1.

			fgetcsv( $handle ); // Skip header row.

			while ( false !== ( $data = fgetcsv( $handle ) ) ) {
				++$record;

				// fgetcsv() returns array( null ) for a blank line, so neither offset is guaranteed.
				$source      = isset( $data[0] ) ? trim( (string) $data[0] ) : '';
				$destination = isset( $data[1] ) ? trim( (string) $data[1] ) : '';

				if ( '' === $source && '' === $destination ) {
					continue;
				}
				if ( '' === $source || '' === $destination ) {
					WP_CLI::warning( "Skipping CSV record $record: both a source and a destination URL are required." );
					continue;
				}

				$rows[] = array( $source, $destination );
			}
			fclose( $handle );

			return $rows;
		}

		/**
		 * Builds the URL that is requested to validate a destination.
		 *
		 * Only values that start with an http:// or https:// scheme are treated as absolute,
		 * so a relative path that merely contains "http" (e.g. /blog/http-guide) is still
		 * resolved against the site URL.
		 *
		 * @param string $destination The destination URL or site-relative path.
		 *
		 * @return string The absolute URL to request.
		 */
		private function to_absolute_url( $destination ) {
			if ( preg_match( '#^https?://#i', $destination ) ) {
				return $destination;
			}

			// Join with exactly one slash regardless of how either side is written.
			return rtrim( get_site_url(), '/' ) . '/' . ltrim( $destination, '/' );
		}

		/**
		 * Resolves the final destination URL for a given source URL by following the redirect chain.
		 *
		 * @param string $destination_url       The initial destination URL to resolve.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @param string $domain                The domain within which to normalize URLs.
		 * @return string The resolved final destination URL.
		 */
		private function resolve_final_destination( string $destination_url, array $source_to_destination, string $domain = self::DEFAULT_DOMAIN ): string {
			$visited = array();

			// Keep following the destination URL until it's not found as a source URL in the array.
			while ( null !== ( $key = $this->find_source_key( $destination_url, $source_to_destination, $domain ) ) ) {
				// A self-redirect or a cycle (A -> B, B -> A) would otherwise never terminate.
				if ( isset( $visited[ $key ] ) ) {
					WP_CLI::warning( "Redirect loop detected while resolving: $destination_url" );
					break;
				}
				$visited[ $key ] = true;

				// Index with the key that actually matched (it may be a trailing-slash variant).
				$destination_url = (string) $source_to_destination[ $key ];
			}

			return $destination_url;
		}

		/**
		 * Checks if the normalized URL exists in the source column, accounting for trailing slashes.
		 *
		 * @param string $url                   The URL to check for existence as a source URL.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @param string $domain                The domain within which to normalize URLs.
		 * @return bool True if the URL exists as a source, false otherwise.
		 */
		private function url_exists_in_column_a( string $url, array $source_to_destination, string $domain ): bool {
			return null !== $this->find_source_key( $url, $source_to_destination, $domain );
		}

		/**
		 * Finds the key under which a URL is stored as a source, if any.
		 *
		 * The normalized URL is tried as written first, then without and with a trailing slash.
		 *
		 * @param string $url                   The URL to look up.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @param string $domain                The domain within which to normalize URLs.
		 * @return string|null The matching array key, or null when the URL is not a source.
		 */
		private function find_source_key( string $url, array $source_to_destination, string $domain ): ?string {
			$normalized = $this->normalize_url( $url, $domain );
			$trimmed    = rtrim( $normalized, '/' );

			foreach ( array_unique( array( $normalized, $trimmed, $trimmed . '/' ) ) as $candidate ) {
				if ( '' !== $candidate && array_key_exists( $candidate, $source_to_destination ) ) {
					return $candidate;
				}
			}

			return null;
		}

		/**
		 * Normalizes a URL by stripping away the domain and protocol if it belongs to the specified domain.
		 * Returns the URL path if it is a relative URL or belongs to the specified domain (or one of its subdomains).
		 * Returns the original URL if it's absolute and not of the specified domain, or if it cannot be parsed.
		 * Query strings and fragments are not part of the returned path.
		 *
		 * @param string $url    The URL to normalize.
		 * @param string $domain The domain within which to normalize URLs.
		 * @return string The normalized URL path or the original URL.
		 */
		private function normalize_url( string $url, string $domain ): string {
			// Parse the URL to get components.
			$parsed_url = wp_parse_url( $url );
			if ( ! is_array( $parsed_url ) ) {
				// Unparseable input: leave it alone rather than mapping it onto "/".
				return $url;
			}

			$host   = strtolower( $parsed_url['host'] ?? '' );
			$domain = strtolower( $domain );

			// Match the domain itself or a real subdomain. A plain substring test would also
			// accept unrelated hosts such as "example.com.evil.net" or "notexample.com".
			$is_local = '' === $host
				|| $host === $domain
				|| ( '' !== $domain && str_ends_with( $host, '.' . $domain ) );

			if ( $is_local ) {
				// Ensure the path is set and prepend a slash if it's not there.
				$path = $parsed_url['path'] ?? '';
				return '/' . ltrim( $path, '/' );
			}

			// Return the original URL if it's absolute and not of the specified domain.
			return $url;
		}
	}

	WP_CLI::add_command( 'csv_process', 'CSV_URL_Processor_Command' );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment