Skip to content

Instantly share code, notes, and snippets.

@andyg2
Created September 15, 2023 05:56
Show Gist options
  • Save andyg2/bd0ab9ba78fbf9b7ef893e21191bd9b1 to your computer and use it in GitHub Desktop.
Save andyg2/bd0ab9ba78fbf9b7ef893e21191bd9b1 to your computer and use it in GitHub Desktop.
Download WordPress page content from the wp-json REST API
<?php
// ---------------------------------------------------------------------------
// Entry point: export every published page of a WordPress site to a JSON file
// and mirror the images referenced by each page under ./images/<slug>/.
//
// usage: get_pages.php?host=www.domain.com
// ---------------------------------------------------------------------------

// Read the target site from the query string. The value is user input, so it
// is only accepted when it parses as "<hostname>[:port][/path]" with no
// credentials, query string or fragment (a path is allowed so WordPress
// installs living in a sub-directory keep working).
$requested_host = (isset($_GET['host']) && is_string($_GET['host'])) ? rtrim(trim($_GET['host']), '/') : '';
$host_parts = ($requested_host === '') ? false : parse_url('https://' . $requested_host);
$host_is_valid = is_array($host_parts)
    && isset($host_parts['host'])
    && filter_var($host_parts['host'], FILTER_VALIDATE_DOMAIN, FILTER_FLAG_HOSTNAME) !== false
    && array_intersect_key($host_parts, array_flip(['user', 'pass', 'query', 'fragment'])) === [];
if (!$host_is_valid) {
    echo 'usage: get_pages.php?host=www.domain.com';
    exit;
}
define('WEBSITE_URL', 'https://' . $requested_host);

// Create the API endpoint URL to fetch the list of pages with specific fields
$per_page = 100;
$pages_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/?_fields=author,id,excerpt,status,title,link&per_page=' . $per_page;
$pages_result = [];

// One cURL handle is reused for every page of the listing
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);

// The REST API returns at most $per_page items per request, so walk the
// listing with the "page" parameter until a short or non-list response.
// The upper bound is only a safety net against a misbehaving server.
for ($page_number = 1; $page_number <= 1000; $page_number++) {
    curl_setopt($ch, CURLOPT_URL, $pages_api_url . '&page=' . $page_number);
    $pages_response = curl_exec($ch);

    // Check for cURL errors
    if (curl_errno($ch)) {
        echo 'cURL Error: ' . curl_error($ch);
        break;
    }

    // Decode the JSON response; anything other than a non-empty list ends the walk
    // (requesting a page past the end yields an error object, not a list)
    $pages_data = json_decode($pages_response);
    if (!is_array($pages_data) || $pages_data === []) {
        if ($page_number === 1) {
            echo 'No pages found.';
        }
        break;
    }

    // Loop through each page and fetch its content
    foreach ($pages_data as $page) {
        if (!isset($page->id, $page->title->rendered, $page->status) || $page->status !== 'publish') {
            continue;
        }

        $title = $page->title->rendered;
        $slug = slugify($title);

        // Individual page fetches are cached on disk (see cache_func)
        $pages_json = cache_func('get_wp_page', $page);
        if (!isset($pages_json['status'])) {
            // Failed fetch or an error payload: nothing to extract
            continue;
        }

        // Optional fields fall back to neutral values instead of raising warnings
        $page_extracted = [
            'slug' => $pages_json['slug'] ?? $slug,
            'title' => $pages_json['title']['rendered'] ?? $title,
            'content' => $pages_json['content']['rendered'] ?? '',
            'excerpt' => $pages_json['excerpt']['rendered'] ?? '',
            'template' => $pages_json['template'] ?? '',
            'meta' => $pages_json['meta'] ?? [],
        ];

        // The slug comes from the remote server: strip anything that could
        // escape ./images/ before using it as a directory name.
        $image_dir_name = preg_replace('/[^A-Za-z0-9_%-]/', '', (string) $page_extracted['slug']);
        if ($image_dir_name === null || $image_dir_name === '') {
            $image_dir_name = $slug;
        }
        $image_dir = './images/' . $image_dir_name;

        downloadImagesFromCss($page_extracted['content'], $image_dir);
        downloadImagesFromHTML($page_extracted['content'], $image_dir);

        $pages_result[$slug] = $page_extracted;
    }

    // A short page means this was the last one
    if (count($pages_data) < $per_page) {
        break;
    }
}

// Close cURL session for the list of pages
curl_close($ch);

// Write the collected pages (possibly an empty list) next to the script
$pages_output_file = './' . slugify(WEBSITE_URL) . '-pages.json';
$pages_output_json = json_encode($pages_result);
if ($pages_output_json === false) {
    echo 'JSON Error: ' . json_last_error_msg();
} elseif (file_put_contents($pages_output_file, $pages_output_json) === false) {
    echo 'Unable to write ' . $pages_output_file;
}
/**
 * Download every image referenced through a CSS url(...) value in the content.
 *
 * The output directory is created when missing, even if nothing is downloaded.
 * Only absolute http(s) and protocol-relative URLs are fetched; data: URIs and
 * relative references are skipped.
 *
 * @param string $page_content     Markup/CSS to scan for url(...) references
 * @param string $output_directory Directory the images are saved into
 * @return void
 */
function downloadImagesFromCss($page_content, $output_directory) {
    if (!is_dir($output_directory)) {
        mkdir($output_directory, 0755, true);
    }
    // Match the argument of url(...), tolerating whitespace inside the parentheses
    $pattern = '/url\(\s*(.*?)\s*\)/i';
    if (!preg_match_all($pattern, (string) $page_content, $matches)) {
        return;
    }
    foreach ($matches[1] as $imageUrl) {
        // The content is HTML, so inline styles may carry entity-encoded
        // quotes/ampersands (url(&quot;...&quot;)): decode before unquoting.
        $imageUrl = html_entity_decode($imageUrl, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // Remove leading and trailing quotes and whitespace
        $imageUrl = trim($imageUrl, "'\" \t\r\n");
        downloadImageToDirectory($imageUrl, $output_directory);
    }
}

/**
 * Download every image referenced by an <img src="..."> tag in the HTML.
 *
 * The output directory is created when missing, even if nothing is downloaded.
 * Only absolute http(s) and protocol-relative URLs are fetched.
 *
 * @param string $html_content     HTML fragment or document to scan
 * @param string $output_directory Directory the images are saved into
 * @return void
 */
function downloadImagesFromHTML($html_content, $output_directory) {
    if (!is_dir($output_directory)) {
        mkdir($output_directory, 0755, true);
    }
    // DOMDocument::loadHTML() throws a ValueError on an empty string in PHP 8
    // (which @ cannot suppress), so bail out before parsing.
    $html_content = (string) $html_content;
    if (trim($html_content) === '') {
        return;
    }
    // Collect parser warnings for invalid HTML internally instead of emitting them
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($html_content);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);

    foreach ($dom->getElementsByTagName('img') as $image_tag) {
        downloadImageToDirectory($image_tag->getAttribute('src'), $output_directory);
    }
}

/**
 * Download one image into a directory unless a file of that name already exists.
 *
 * The transfer is written to a temporary "<name>.part" file and only renamed
 * to its final name on success, so a failed download never leaves behind an
 * empty or truncated file that would be mistaken for a finished one later.
 *
 * @param string $image_url        Absolute http(s) or protocol-relative image URL
 * @param string $output_directory Existing directory to save into
 * @return bool True when the file exists after the call, false when skipped or failed
 */
function downloadImageToDirectory($image_url, $output_directory) {
    $image_url = trim((string) $image_url);
    // Protocol-relative URLs (//cdn.example.com/a.png) are fetched over https
    if (strpos($image_url, '//') === 0) {
        $image_url = 'https:' . $image_url;
    }
    // Skip data: URIs, relative paths and any non-http(s) scheme
    if (!preg_match('#^https?://#i', $image_url)) {
        return false;
    }
    // File name is the last segment of the URL (same naming as before)
    $image_filename = basename($image_url);
    if ($image_filename === '' || $image_filename === '.' || $image_filename === '..' || strpos($image_filename, "\0") !== false) {
        return false;
    }
    $output_path = $output_directory . '/' . $image_filename;
    if (file_exists($output_path)) {
        return true;
    }

    $temp_path = $output_path . '.part';
    $fp = fopen($temp_path, 'wb');
    if ($fp === false) {
        echo "Unable to open file for writing: " . $temp_path;
        return false;
    }
    $ch_image = curl_init($image_url);
    if ($ch_image === false) {
        fclose($fp);
        unlink($temp_path);
        echo "cURL Error: unable to initialise request";
        return false;
    }
    // Stream the response body straight into the temporary file
    curl_setopt($ch_image, CURLOPT_FILE, $fp);
    curl_setopt($ch_image, CURLOPT_HEADER, 0);
    curl_setopt($ch_image, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch_image, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch_image, CURLOPT_MAXREDIRS, 5);
    // Treat HTTP >= 400 as a failure so error pages are not saved as images
    curl_setopt($ch_image, CURLOPT_FAILONERROR, true);
    curl_setopt($ch_image, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch_image, CURLOPT_TIMEOUT, 60);
    $succeeded = curl_exec($ch_image) !== false && curl_errno($ch_image) === 0;
    if (!$succeeded) {
        echo "cURL Error: " . curl_error($ch_image);
    }
    // Close cURL session for the image
    curl_close($ch_image);
    fclose($fp);

    if (!$succeeded) {
        unlink($temp_path);
        return false;
    }
    return rename($temp_path, $output_path);
}
/**
 * Fetch the full REST representation of a single WordPress page.
 *
 * Requests WEBSITE_URL/wp-json/wp/v2/pages/<id> and decodes the JSON body.
 * Transport errors and HTTP error responses are reported with echo and yield
 * null rather than an error payload that callers could mistake for a page.
 *
 * @param object $page Page summary from the pages listing; reads ->id and ->title->rendered
 * @return array|null Decoded page data as an associative array, or null on failure
 */
function get_wp_page($page) {
    $page_id = $page->id;
    $page_title = $page->title->rendered;
    // Create the API endpoint URL to fetch individual page content; the id
    // comes from a remote response, so encode it before putting it in the path
    $page_content_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/' . rawurlencode((string) $page_id);

    // Initialize cURL session to fetch page content
    $ch_page = curl_init($page_content_api_url);
    if ($ch_page === false) {
        echo 'cURL Error for page ' . $page_title . ': unable to initialise request';
        return null;
    }
    curl_setopt($ch_page, CURLOPT_RETURNTRANSFER, true);
    // Bound the request so one unresponsive server cannot hang the whole export
    curl_setopt($ch_page, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch_page, CURLOPT_TIMEOUT, 60);

    // Execute the cURL request to get the page content
    $page_content_response = curl_exec($ch_page);
    $page_content_data = null;

    if (curl_errno($ch_page)) {
        echo 'cURL Error for page ' . $page_title . ': ' . curl_error($ch_page);
    } else {
        $http_status = (int) curl_getinfo($ch_page, CURLINFO_HTTP_CODE);
        if ($http_status >= 400) {
            echo 'HTTP Error for page ' . $page_title . ': ' . $http_status;
        } else {
            // Decode the JSON response; anything that is not an object/list counts as a failure
            $decoded = json_decode($page_content_response, true);
            if (is_array($decoded)) {
                $page_content_data = $decoded;
            }
        }
    }

    // Close cURL session for individual page
    curl_close($ch_page);
    return $page_content_data;
}
/**
 * Caches the output of a function on disk for a given amount of time.
 *
 * The cache file is named "<func>-<md5 of func+args>.<json|txt>" inside
 * $cache_dir. A fresh, readable cache file is returned without calling the
 * function. A null result is treated as a failed call and is never cached,
 * so a transient failure is retried on the next call instead of being served
 * for the whole cache lifetime.
 *
 * @param string  $func      Name of the function to call; also used in the cache file name
 * @param mixed   $args      Value passed to $func as its single argument
 * @param integer $seconds   Number of seconds a cached result stays valid, default 604800 (one week)
 * @param boolean $json      Whether to store the result as JSON (true) or as a raw string (false)
 * @param string  $cache_dir Relative or absolute cache directory, with trailing slash
 * @return mixed The function result; in JSON mode cached results come back as decoded associative arrays
 */
function cache_func($func, $args, $seconds = 604800, $json = true, $cache_dir = './cache/') {
    // create dir if not exists
    if (!is_dir($cache_dir)) {
        mkdir($cache_dir, 0755, true);
    }
    // generate simple hash of function name and arguments
    // file deepcode ignore InsecureHash: Hash only used for filename creation
    $request_hash = md5(json_encode([$func, $args]));
    // e.g: ./cache/expensive_function-1234567890abcdef1234567890abcdef.json
    $request_file = $cache_dir . $func . '-' . $request_hash . '.' . ($json ? 'json' : 'txt');

    // Serve from the cache when the file exists and has not expired
    if (is_file($request_file) && filemtime($request_file) >= time() - $seconds) {
        $cached = file_get_contents($request_file);
        if ($cached !== false) {
            if (!$json) {
                return $cached;
            }
            $decoded = json_decode($cached, true);
            // null means a corrupt/empty file or a stored failure: fall through and recompute
            if ($decoded !== null) {
                return $decoded;
            }
        }
    }

    // Cache miss: call the function with its single argument
    $result = $func($args);
    if ($result === null) {
        // Do not persist failures
        return $result;
    }

    $payload = $json ? json_encode($result) : $result;
    // A result that cannot be encoded is returned uncached rather than written as an empty file
    if ($payload !== false) {
        file_put_contents($request_file, $payload, LOCK_EX);
    }
    return $result;
}
/**
 * Create a slug from a string
 *
 * The input is transliterated to ASCII, stripped of every character that is
 * not a letter, digit, separator (/ _ | + space -) or listed in $skip,
 * lower-cased, and finally each run of separators is collapsed into $delimiter.
 *
 * @param string $string    input string to slugify
 * @param string $delimiter string to separate each word [-]
 * @param string $skip      extra characters to keep in the output (taken literally)
 * @param mixed  $replace   single string or array of strings to blank out before slugifying (case-sensitive)
 * @return string a slugified representation of the input string
 *
 * Example
 * slugify("Hello World, It's me!", '-', '', 'World')
 * return: 'hello-its-me'
 */
function slugify($string, $delimiter = '-', $skip = '', $replace = []) {
    $string = (string) $string;
    // Transliteration depends on LC_CTYPE: switch to a UTF-8 locale and restore it afterwards
    $oldLocale = setlocale(LC_ALL, '0');
    setlocale(LC_ALL, 'en_US.UTF-8');
    $clean = iconv('UTF-8', 'ASCII//TRANSLIT', $string);
    if ($clean === false) {
        // Input was not valid UTF-8; fall back to the raw bytes (non-ASCII is stripped below)
        $clean = $string;
    }
    if (!empty($replace)) {
        $clean = str_replace((array) $replace, ' ', $clean);
    }
    // Characters to keep. $skip is escaped and placed BEFORE the trailing
    // hyphen so that the hyphen stays a literal instead of forming a range
    // from the space character to the first $skip character.
    $keep = 'a-zA-Z0-9\/_|+ ';
    if (!empty($skip)) {
        $keep .= preg_quote($skip, '/');
    }
    $clean = preg_replace('/[^' . $keep . '-]/', '', $clean);
    $clean = strtolower($clean);
    // Collapse every run of separators into a single delimiter
    $clean = preg_replace('/[\/_|+ -]+/', $delimiter, $clean);
    $clean = trim($clean, $delimiter);
    if ($oldLocale !== false) {
        setlocale(LC_ALL, $oldLocale);
    }
    return $clean;
}
/**
 * Debug helper: prints a value with print_r() inside a <pre> element.
 *
 * @param mixed $a value to dump (string, array, object, ...)
 * @param mixed $h optional heading; when truthy it is printed in an <h3> above the dump
 * @return void
 */
function pre($a, $h = false) {
    $opening = '<pre>';
    if ($h) {
        // Heading goes in front of the opening tag
        $opening = '<h3>' . $h . '</h3>' . $opening;
    }
    echo $opening;
    print_r($a);
    echo '</pre>';
}
/**
 * Debug helper: dumps a value via pre() and then terminates the script.
 *
 * @param mixed   $a   value to dump
 * @param mixed   $h   optional heading shown above the dump
 * @param boolean $dbg when true, also print the debug_backtrace() in a second <pre> element
 * @return void this function never returns; it ends with exit
 */
function prex($a, $h = false, $dbg = false) {
    pre($a, $h);
    if ($dbg) {
        // echo with commas emits each piece in order, so the opening tag is
        // written before the backtrace is rendered
        echo '<pre>', print_r(debug_backtrace(), true), '</pre>';
    }
    exit;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment