Download WordPress page content from the wp-json REST API
<?php
if (isset($_GET['host'])) {
  define('WEBSITE_URL', 'https://' . $_GET['host']);
} else {
  echo 'usage: get_pages.php?host=www.domain.com';
  exit;
}
// Create the API endpoint URL to fetch the list of pages with specific fields
$pages_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/?_fields=author,id,excerpt,status,title,link&per_page=100';
$pages_result = [];

// Initialize cURL session to fetch the list of pages
$ch = curl_init($pages_api_url);

// Set cURL options
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

// Execute the cURL request to get the list of pages
$pages_response = curl_exec($ch);

// Check for cURL errors
if (curl_errno($ch)) {
  echo 'cURL Error: ' . curl_error($ch);
} else {
  // Decode the JSON response
  $pages_data = json_decode($pages_response);

  // Check if pages data is available
  if ($pages_data && is_array($pages_data)) {
    // Loop through each page and fetch its content
    foreach ($pages_data as $page) {
      if (isset($page->title->rendered) && $page->status == 'publish') {
        $title = $page->title->rendered;
        $slug = slugify($title);
        $pages_json = cache_func('get_wp_page', $page);
        if (isset($pages_json['status'])) {
          $page_extracted = [];
          $page_extracted['slug'] = $pages_json['slug'];
          $page_extracted['title'] = $pages_json['title']['rendered'];
          $page_extracted['content'] = $pages_json['content']['rendered'];
          $page_extracted['excerpt'] = $pages_json['excerpt']['rendered'];
          $page_extracted['template'] = $pages_json['template'];
          $page_extracted['meta'] = $pages_json['meta'];
          downloadImagesFromCss($page_extracted['content'], './images/' . $page_extracted['slug']);
          downloadImagesFromHTML($page_extracted['content'], './images/' . $page_extracted['slug']);
          $pages_result[$slug] = $page_extracted;
        }
      }
    }
  } else {
    echo 'No pages found.';
  }
}

// Close cURL session for the list of pages
curl_close($ch);

file_put_contents('./' . slugify(WEBSITE_URL) . '-pages.json', json_encode($pages_result));
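
// Output layout note (illustrative, not printed by the script itself): a successful run
// for ?host=www.example.com leaves behind roughly the following files, where the JSON
// filename comes from slugify(WEBSITE_URL):
//
//   ./<slugified-site-url>-pages.json    map of page slug => extracted fields
//   ./images/<page-slug>/                images found in each page's content
//   ./cache/get_wp_page-<md5-hash>.json  cached wp-json responses (kept for one week)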

// Function to locate and download images referenced in CSS url(...) declarations
function downloadImagesFromCss($page_content, $output_directory) {
  if (!is_dir($output_directory)) {
    mkdir($output_directory, 0755, true);
  }

  // Define a regular expression pattern to match url(...) values
  $pattern = '/url\((.*?)\)/';

  // Find all image URLs in the content
  if (preg_match_all($pattern, $page_content, $matches)) {
    // Loop through the matched URLs
    foreach ($matches[1] as $image_url) {
      // Remove leading and trailing single or double quotes
      $image_url = trim($image_url, "'\"");

      // Derive a filename for the image from the last path segment
      $image_filename = basename($image_url);

      // Define the output path for the downloaded image
      $output_path = $output_directory . '/' . $image_filename;

      if (!file_exists($output_path)) {
        // Initialize cURL session to download the image
        $ch_image = curl_init($image_url);

        // Set cURL options to save the image to the output path
        $fp = fopen($output_path, 'wb');
        curl_setopt($ch_image, CURLOPT_FILE, $fp);
        curl_setopt($ch_image, CURLOPT_HEADER, 0);
        curl_exec($ch_image);

        // Check for cURL errors
        if (curl_errno($ch_image)) {
          echo 'cURL Error: ' . curl_error($ch_image);
        }

        // Close cURL session and file handle for the image
        curl_close($ch_image);
        fclose($fp);
      }
    }
  }
}

// Function to locate and download images from <img> tags in HTML content
function downloadImagesFromHTML($html_content, $output_directory) {
  if (!is_dir($output_directory)) {
    mkdir($output_directory, 0755, true);
  }

  // Create a DOMDocument object to parse the HTML
  $dom = new DOMDocument();
  @$dom->loadHTML($html_content); // Use @ to suppress warnings for invalid HTML

  // Find all image tags in the HTML
  $image_tags = $dom->getElementsByTagName('img');
  foreach ($image_tags as $image_tag) {
    // Get the image source URL
    $image_url = $image_tag->getAttribute('src');

    // Derive a filename for the image from the last path segment
    $image_filename = basename($image_url);

    // Define the output path for the downloaded image
    $output_path = $output_directory . '/' . $image_filename;

    if (!file_exists($output_path)) {
      // Initialize cURL session to download the image
      $ch_image = curl_init($image_url);

      // Set cURL options to save the image to the output path
      $fp = fopen($output_path, 'wb');
      curl_setopt($ch_image, CURLOPT_FILE, $fp);
      curl_setopt($ch_image, CURLOPT_HEADER, 0);
      curl_exec($ch_image);

      // Check for cURL errors
      if (curl_errno($ch_image)) {
        echo 'cURL Error: ' . curl_error($ch_image);
      }

      // Close cURL session and file handle for the image
      curl_close($ch_image);
      fclose($fp);
    }
  }
}
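
// Standalone usage sketch (illustrative; the URLs and paths below are placeholders,
// not taken from the script): both helpers can also be called on arbitrary markup.
//
//   downloadImagesFromHTML('<img src="https://example.com/img/logo.png">', './images/example');
//   downloadImagesFromCss('background: url("https://example.com/img/bg.jpg");', './images/example');
//
// Each call creates the output directory if needed and skips files that already exist
// at the target path.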

// Fetch the full record for a single page from the wp-json API
function get_wp_page($page) {
  $page_id = $page->id;
  $page_title = $page->title->rendered;

  // Create the API endpoint URL to fetch individual page content
  $page_content_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/' . $page_id;

  // Initialize cURL session to fetch page content
  $ch_page = curl_init($page_content_api_url);

  // Set cURL options
  curl_setopt($ch_page, CURLOPT_RETURNTRANSFER, true);

  // Execute the cURL request to get the page content
  $page_content_response = curl_exec($ch_page);

  // Check for cURL errors
  if (curl_errno($ch_page)) {
    echo 'cURL Error for page ' . $page_title . ': ' . curl_error($ch_page);
    $page_content_data = null;
  } else {
    // Decode the JSON response
    $page_content_data = json_decode($page_content_response, true);
  }

  // Close cURL session for the individual page
  curl_close($ch_page);

  return $page_content_data;
}
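
// In the main loop above, get_wp_page() is not called directly; it is routed through
// cache_func('get_wp_page', $page), so repeated runs against the same site reuse the
// cached wp-json response instead of downloading each page again.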

/**
 * Caches the output of any function for a given amount of time
 *
 * @param string  $func      name of the function to call
 * @param mixed   $args      argument passed to the function
 * @param integer $seconds   number of seconds to cache the function response (default 604800, one week)
 * @param boolean $json      whether to treat the output of the function as JSON (default true)
 * @param string  $cache_dir relative or absolute path of cached results, with trailing slash (default ./cache/)
 * @return mixed  the cached or freshly computed result
 */
function cache_func($func, $args, $seconds = 604800, $json = true, $cache_dir = './cache/') {
  // Caches for $seconds seconds the result of any function ($func) in directory ($cache_dir)
  // Create the cache directory if it does not exist
  if (!is_dir($cache_dir)) {
    mkdir($cache_dir, 0755, true);
  }

  // Generate a simple hash of the function name and arguments
  // file deepcode ignore InsecureHash: Hash only used for filename creation
  $request_hash = md5(json_encode(array($func, $args)));

  // e.g. ./cache/expensive_function-1234567890abcdef1234567890abcdef.json
  $request_file = $cache_dir . $func . '-' . $request_hash . '.' . ($json ? 'json' : 'txt');

  // Re-run the function if there is no cache file or the cache file is older than $seconds
  $run = !file_exists($request_file) || filemtime($request_file) < time() - $seconds;

  if ($run) { // call the function and cache its result
    $result = $func($args);
    $json ? file_put_contents($request_file, json_encode($result)) : file_put_contents($request_file, $result);
  } else { // use the cache
    $result = $json ? json_decode(file_get_contents($request_file), true) : file_get_contents($request_file);
  }

  return $result;
}
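
// Usage sketch (my_expensive_lookup() is a hypothetical example function, not part of
// this script; it must take exactly one argument and return JSON-serialisable data):
//
//   $data = cache_func('my_expensive_lookup', ['id' => 42]);        // cached for one week
//   $data = cache_func('my_expensive_lookup', ['id' => 42], 3600);  // cached for one hour
//
// Because $args is handed to the function as a single value, functions that need
// several parameters should accept one array and unpack it themselves.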

/**
 * Create a slug from a string
 *
 * @param string $string    input string to slugify
 * @param string $delimiter string used to separate each word (default '-')
 * @param string $skip      extra characters to keep (appended to the character class of allowed characters)
 * @param mixed  $replace   single string or array of strings to remove from the output
 * @return string a slugified representation of the input string
 *
 * Example:
 *   slugify("Hello World, It's me!", '-', '', ['World'])  returns 'hello-its-me'
 */
function slugify($string, $delimiter = '-', $skip = '', $replace = []) {
  $oldLocale = setlocale(LC_ALL, '0');
  setlocale(LC_ALL, 'en_US.UTF-8');

  // Transliterate to ASCII
  $clean = iconv('UTF-8', 'ASCII//TRANSLIT', $string);

  // Remove any explicitly unwanted strings
  if (!empty($replace)) {
    $clean = str_replace((array) $replace, ' ', $clean);
  }

  // Strip every character that is not in the allowed set
  $regex = '^a-zA-Z0-9\/_|+ -';
  if (!empty($skip)) {
    $regex .= $skip;
  }
  $clean = preg_replace("/[" . $regex . "]/", '', $clean);
  $clean = strtolower($clean);

  // Collapse separators and spaces into the delimiter
  $clean = preg_replace("/[\/_|+ -]+/", $delimiter, $clean);
  $clean = trim($clean, $delimiter);

  setlocale(LC_ALL, $oldLocale);
  return $clean;
}
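
// Usage sketch (inputs are illustrative):
//
//   slugify('Hello World!');                               // 'hello-world'
//   slugify("Hello World, It's me!", '-', '', ['World']);  // 'hello-its-me'
//   slugify('My Page', '_');                               // 'my_page'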

/**
 * Debug helper that outputs a string or array wrapped in a <pre> tag
 *
 * @param mixed          $a array or string that needs wrapping
 * @param boolean|string $h optional heading to show above the output
 * @return void
 */
function pre($a, $h = false) {
  echo $h ? '<h3>' . $h . '</h3><pre>' : '<pre>';
  print_r($a);
  echo '</pre>';
}

/**
 * Outputs a string or array wrapped in a <pre> tag, then exits, with an optional debug_backtrace
 *
 * @param mixed          $a   array or string that needs wrapping
 * @param boolean|string $h   optional heading to show above the output
 * @param boolean        $dbg whether to also print a debug_backtrace
 * @return void
 */
function prex($a, $h = false, $dbg = false) {
  pre($a, $h);
  if ($dbg) {
    echo '<pre>';
    print_r(debug_backtrace());
    echo '</pre>';
  }
  exit;
}