Last active
May 9, 2024 20:46
-
-
Save karimkawambwa/caa5fde0a2ea64b380f1f23f1c2f0b01 to your computer and use it in GitHub Desktop.
Coastpress Migration Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Looks up a published blog post by its exact title.
 *
 * Only rows whose post_type is 'post' and whose post_status is 'publish' are
 * considered, and the query is limited to a single row.
 *
 * @param string $title Title to search for; bound as a %s placeholder.
 * @return mixed Whatever $wpdb->get_var() yields for the ID column
 *               (WordPress documents this as a string, or null when no row matches).
 */
function post_exists_by_title($title) {
    global $wpdb;

    // The title goes through prepare() so it is never interpolated into the SQL text.
    $query = $wpdb->prepare(
        "SELECT ID FROM {$wpdb->posts} WHERE post_title = %s AND post_type = 'post' AND post_status = 'publish' LIMIT 1",
        $title
    );

    return $wpdb->get_var($query);
}
/**
 * Downloads an image into the current uploads directory and sets it as the
 * featured image (post thumbnail) of a post.
 *
 * If a file with the same name is already present in the uploads directory it
 * is reused instead of being downloaded again. The function returns silently,
 * without touching the post, when the post ID or URL is empty, the uploads
 * directory is unavailable, or the download/write fails.
 *
 * NOTE(review): files are matched by name only, so two different remote images
 * that share a basename will resolve to the same local file — confirm this is
 * acceptable for the source blog.
 *
 * @param int    $post_id   ID of the post that receives the thumbnail.
 * @param string $image_url Absolute URL of the remote image.
 * @return void
 */
function set_featured_image($post_id, $image_url) {
    // Nothing to do without a source URL or a valid target post
    // (callers may pass 0 / WP_Error when the post could not be saved).
    if (empty($image_url) || empty($post_id) || is_wp_error($post_id)) {
        return;
    }

    $upload_dir = wp_upload_dir(); // WordPress upload directory
    if (!empty($upload_dir['error'])) {
        return;
    }

    // Build the file name from the URL *path* only, so a query string never
    // ends up in it. Decode BEFORE taking the basename: doing it the other way
    // round lets an encoded "..%2F" escape the uploads directory.
    $url_path = parse_url($image_url, PHP_URL_PATH);
    if (!is_string($url_path)) {
        return;
    }
    $filename = sanitize_file_name(basename(urldecode($url_path)));
    if ($filename === '') {
        return;
    }

    $file_path = $upload_dir['path'] . '/' . $filename;
    $attach_id = 0;

    if (file_exists($file_path)) {
        // File exists, find the attachment ID (0 when no attachment references it).
        $attach_id = attachment_url_to_postid($upload_dir['url'] . '/' . $filename);
    } else {
        // Save the new file only when it does not exist yet, and only if the
        // download actually produced data.
        $image_data = file_get_contents($image_url);
        if ($image_data === false || $image_data === '') {
            return;
        }
        if (file_put_contents($file_path, $image_data) === false) {
            return;
        }
    }

    if (!$attach_id) {
        // Either the file was just downloaded, or it was on disk without an
        // attachment record: register it in the media library.
        $wp_filetype = wp_check_filetype($filename, null);
        $attachment = array(
            'post_mime_type' => $wp_filetype['type'],
            'post_title'     => sanitize_file_name($filename),
            'post_content'   => '',
            'post_status'    => 'inherit'
        );
        $attach_id = wp_insert_attachment($attachment, $file_path, $post_id);

        if ($attach_id && !is_wp_error($attach_id)) {
            // wp_generate_attachment_metadata() lives in an admin include that
            // is not loaded on every request.
            if (!function_exists('wp_generate_attachment_metadata')) {
                require_once ABSPATH . 'wp-admin/includes/image.php';
            }
            wp_update_attachment_metadata(
                $attach_id,
                wp_generate_attachment_metadata($attach_id, $file_path)
            );
        }
    }

    if (!$attach_id || is_wp_error($attach_id)) {
        return;
    }

    // Set the found or new attachment as the featured image
    set_post_thumbnail($post_id, $attach_id);
}
/**
 * Fetches a post page and returns the HTML of its body container.
 *
 * The page is requested with cURL (following redirects) and the first
 * <div> whose class attribute contains "post-body" is serialised as HTML.
 *
 * @param string|null $url Absolute URL of the post page.
 * @return string HTML markup of the matched element, or the literal string
 *                'No content found' when the URL is empty, the request fails,
 *                the document cannot be parsed, or no matching element exists.
 */
function fetch_full_post_content($url) {
    if (!$url) {
        return 'No content found';
    }

    // Initialize cURL session
    $ch = curl_init();
    if ($ch === false) {
        return 'No content found';
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
    // Bound the request so one unresponsive page cannot stall the whole run.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    // Execute cURL session and close it
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; DOMDocument::loadHTML() rejects an
    // empty source (ValueError on PHP 8), so stop here instead.
    if (!is_string($html) || trim($html) === '') {
        return 'No content found';
    }

    // Collect libxml parse errors internally for the duration of the parse,
    // then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded) {
        return 'No content found';
    }

    $xpath = new DOMXPath($doc);
    $matches = $xpath->query("//div[contains(@class, 'post-body')]");
    $contentNode = $matches ? $matches->item(0) : null;
    if (!$contentNode) {
        return 'No content found';
    }

    // Serialise as HTML rather than canonical XML: C14N() writes void elements
    // as "<br></br>", which HTML parsers interpret as two line breaks.
    $markup = $doc->saveHTML($contentNode);

    return $markup === false ? 'No content found' : $markup;
}
/**
 * Imports every post listed on a blog index page, then follows the
 * "More posts" link and repeats until no further page is found.
 *
 * For each element matching div.post-outer that has a title link, the function
 * reads title, post URL, excerpt, publication date and JSON-LD image URL,
 * fetches the full post body via fetch_full_post_content(), inserts a new
 * WordPress post or updates the existing one with the same title, and assigns
 * the featured image via set_featured_image(). Progress is echoed as HTML.
 *
 * Pages are walked in a loop rather than by recursion, and each URL is visited
 * at most once, so a pagination link that points back to an earlier page
 * cannot cause an endless run.
 *
 * @param string $url URL of the first index page to process.
 * @return void
 */
function fetch_posts($url) {
    $visited = array();

    while (is_string($url) && $url !== '' && !isset($visited[$url])) {
        $visited[$url] = true;

        // Initialize cURL session
        $ch = curl_init();
        if ($ch === false) {
            break;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
        // Bound the request so one unresponsive page cannot stall the whole run.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        // Execute cURL session and close it
        $html = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; DOMDocument::loadHTML() rejects
        // an empty source (ValueError on PHP 8), so stop paging here.
        if (!is_string($html) || trim($html) === '') {
            echo "Could not fetch page: " . esc_html($url) . "<br>";
            break;
        }

        // Suppress libXML errors while parsing, then restore the previous setting.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) {
            break;
        }

        $xpath = new DOMXPath($doc);
        $postsQuery = "//div[contains(@class, 'post-outer')]";
        $nextPageQuery = "//a[@title='More posts']";

        foreach ($xpath->query($postsQuery) as $post) {
            $titleNode = $xpath->query(".//h3[contains(@class, 'post-title')]/a", $post)->item(0);
            if (!$titleNode) {
                // Without the title link there is neither a title nor a URL to
                // fetch the body from; importing would only create a
                // placeholder post that every such entry keeps overwriting.
                echo "Skipped an entry without a title link<br>";
                continue;
            }
            $title = $titleNode->nodeValue;
            $postUrl = $titleNode->getAttribute('href');

            // Fetch full post content from the post's page
            $fullPostContent = fetch_full_post_content($postUrl);

            // Extract excerpt from snippet-item
            $excerptNode = $xpath->query(".//div[contains(@class, 'snippet-item')]", $post)->item(0);
            $excerpt = $excerptNode ? $excerptNode->nodeValue : '';

            // Publication time; fall back to "now" when the attribute is
            // missing or cannot be parsed (strtotime() returns false).
            $dateNode = $xpath->query(".//time[@class='published']", $post)->item(0);
            $timestamp = $dateNode ? strtotime($dateNode->getAttribute('datetime')) : false;
            if ($timestamp === false) {
                $timestamp = time();
            }
            // post_date is expected in the site's local time and post_date_gmt
            // in UTC; derive both from the same instant so they stay in sync.
            $post_date_gmt = gmdate('Y-m-d H:i:s', $timestamp);

            $post_data = array(
                'post_title'    => sanitize_text_field($title),
                'post_content'  => wp_kses_post($fullPostContent), // Use the full content
                'post_status'   => 'publish',
                'post_author'   => 1,
                'post_date'     => get_date_from_gmt($post_date_gmt),
                'post_date_gmt' => $post_date_gmt,
                'post_category' => array(1),
                'post_excerpt'  => sanitize_text_field($excerpt)
            );

            // The title is scraped content, so escape it before echoing.
            $safe_title = esc_html($post_data['post_title']);

            $existing_post_id = post_exists_by_title($post_data['post_title']);
            if ($existing_post_id) {
                // Post exists, update it
                $post_data['ID'] = $existing_post_id; // Set the ID to update the existing post
                $post_id = wp_update_post($post_data);
                $action = 'Updated existing post: ';
            } else {
                // Post does not exist, insert it
                $post_id = wp_insert_post($post_data);
                $action = 'Inserted new post: ';
            }

            // Both calls report failure with 0 (or a WP_Error); do not go on to
            // attach an image to a post that was never saved.
            if (!$post_id || is_wp_error($post_id)) {
                echo "Failed to save post: " . $safe_title . "<br>";
                continue;
            }
            echo $action . $safe_title . " (ID: $post_id)<br>";

            // Extract image URL from JSON-LD script
            $imageURL = '';
            foreach ($xpath->query(".//script[@type='application/ld+json']", $post) as $scriptNode) {
                $jsonData = json_decode($scriptNode->nodeValue, true);
                if (is_array($jsonData) && isset($jsonData['image']['url']) && is_string($jsonData['image']['url'])) {
                    $imageURL = $jsonData['image']['url'];
                    break;
                }
            }

            // Set featured image
            set_featured_image($post_id, $imageURL);
        }

        // Check for the next page; an absent link ends the loop.
        $nextPage = $xpath->query($nextPageQuery)->item(0);
        $url = $nextPage ? $nextPage->getAttribute('href') : '';
    }
}
// Entry point: start the import at the blog's front page; fetch_posts()
// follows the "More posts" links from there.
fetch_posts('https://coastregionpressclub.blogspot.com/');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment