Skip to content

Instantly share code, notes, and snippets.

@wolffe
Created July 16, 2025 15:20
Show Gist options
  • Save wolffe/836bef3583d6cce06b4cb81644631c9e to your computer and use it in GitHub Desktop.
Save wolffe/836bef3583d6cce06b4cb81644631c9e to your computer and use it in GitHub Desktop.
A Houzez scraper plugin for WordPress
<?php
/**
* Plugin Name: Houzez Property Scraper
* Plugin URI: https://www.4property.com/
* Description: A Houzez scraper plugin for WordPress.
* Version: 1.0.0
* Author: Ciprian Popescu
* Author URI: https://www.4property.com/
* License: GNU General Public License v3 or later
* License URI: https://www.gnu.org/licenses/gpl-3.0.html
*/
/**
 * Fetch the body of a URL with cURL.
 *
 * Redirects are followed and the response headers are not included in the
 * returned string. Failures are written to the PHP error log instead of being
 * echoed: this function is called from an AJAX handler and from a shortcode,
 * where stray output would corrupt the JSON response or the page markup.
 *
 * @param string $url Absolute URL to request.
 * @return string|false Response body on success, false when the request fails.
 */
function fetch_html($url) {
    $ch = curl_init();
    if ($ch === false) {
        error_log('cURL Error: unable to initialise a cURL handle');
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => false,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 10,
        // The URL can originate from a request, so only allow web protocols
        // (no file://, gopher://, ...), including after a redirect.
        CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        // Without timeouts a stalled remote server blocks the PHP worker.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);

    $response = curl_exec($ch);

    if ($response === false || curl_errno($ch)) {
        error_log('cURL Error: ' . curl_error($ch));
        curl_close($ch);
        return false;
    }

    curl_close($ch);
    return $response;
}
/**
 * Return the trimmed text of the first node matched by an XPath expression.
 *
 * @param DOMXPath $xpath   XPath evaluator bound to the parsed page.
 * @param string   $query   XPath expression to evaluate.
 * @param mixed    $default Value returned when nothing matches.
 * @return mixed Trimmed text content of the first match, or $default.
 */
function houzez_scraper_first_text($xpath, $query, $default = '') {
    $nodes = $xpath->query($query);
    if ($nodes === false || $nodes->length === 0) {
        return $default;
    }
    return trim($nodes->item(0)->textContent);
}

/**
 * Attach a post to a taxonomy term by name, creating the term when missing.
 *
 * @param int    $post_id  Post to tag.
 * @param string $name     Human-readable term name.
 * @param string $taxonomy Taxonomy slug.
 * @return void
 */
function houzez_scraper_assign_term($post_id, $name, $taxonomy) {
    $term_name = sanitize_text_field((string) $name);
    if ($term_name === '') {
        return;
    }

    $term = term_exists($term_name, $taxonomy);
    if (!is_array($term)) {
        // wp_insert_term() returns an array with 'term_id' or a WP_Error; it
        // must not be handed to wp_set_object_terms() as-is.
        $term = wp_insert_term($term_name, $taxonomy);
        if (is_wp_error($term)) {
            error_log('Term creation error (' . $taxonomy . '): ' . $term->get_error_message());
            return;
        }
    }

    wp_set_object_terms($post_id, (int) $term['term_id'], $taxonomy);
}

/**
 * Parse a Houzez property page and import it as a `property` post.
 *
 * Extracts the listing fields from the HTML with XPath, inserts a published
 * `property` post, stores the fields as post meta, sideloads the gallery
 * images and assigns the type / area / county taxonomy terms.
 *
 * @param string $html Raw HTML of a single property page.
 * @return array Scraped fields plus the new 'post_id' (0 when the insert
 *               failed). Keys: propertyId, post_id, date, title, status,
 *               propertyType, bedrooms, bathrooms, garage, size, price,
 *               description, city, county, images.
 */
function scrape_data($html) {
    $dom = new DOMDocument();

    // Silence markup warnings while parsing, then restore whatever libxml
    // error mode the caller had instead of forcing it off.
    $previous_libxml = libxml_use_internal_errors(true);
    $dom->loadHTML($html, LIBXML_NOERROR);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml);

    $xpath = new DOMXPath($dom);

    $propIdNode = $xpath->query('//li[contains(@class, "item-tool houzez-print")]');
    $propertyId = ($propIdNode->length > 0) ? $propIdNode->item(0)->getAttribute('data-propid') : 0;

    // The page shows "Updated on <Month D, YYYY>"; fall back to today when
    // the label is missing or cannot be parsed.
    $formattedDate = date('Y-m-d');
    $dateText = houzez_scraper_first_text($xpath, '//span[contains(@class, "small-text grey")]', '');
    if (preg_match('/Updated on ([A-Za-z]+ \d{1,2}, \d{4})/', $dateText, $matches)) {
        $timestamp = strtotime($matches[1]);
        if ($timestamp !== false) {
            $formattedDate = date('Y-m-d', $timestamp);
        }
    }

    $title  = houzez_scraper_first_text($xpath, '//div[@class="page-title"]/h1', 'Title not found');
    $status = houzez_scraper_first_text($xpath, '//a[contains(@class, "label-status")]', '');

    $overview     = '//div[@class="d-flex property-overview-data"]';
    $propertyType = houzez_scraper_first_text($xpath, $overview . '//ul[1]/li[2]/strong', '');
    $bedrooms     = houzez_scraper_first_text($xpath, $overview . '//ul[2]/li[1]/strong', 0);
    $bathrooms    = houzez_scraper_first_text($xpath, $overview . '//ul[3]/li[1]/strong', 0);
    $garage       = houzez_scraper_first_text($xpath, $overview . '//ul[4]/li[1]/strong', 0);
    $size         = houzez_scraper_first_text($xpath, $overview . '//ul[5]/li[1]/strong', '');

    // Keep digits only, so currency symbols and thousands separators vanish.
    $price = '';
    $priceNode = $xpath->query('//ul[contains(@class, "item-price-wrap")]//li[@class="item-price"]');
    if ($priceNode->length > 0) {
        $price = preg_replace('/[^\d]/', '', $priceNode->item(0)->textContent);
    }

    // The description is kept as HTML (serialised node), not plain text.
    $description = '';
    $descriptionNode = $xpath->query('//div[@id="property-description-wrap"]//div[@class="block-content-wrap"]');
    if ($descriptionNode->length > 0) {
        $description = $dom->saveHTML($descriptionNode->item(0));
    }

    $city   = houzez_scraper_first_text($xpath, '//div[@id="property-address-wrap"]//li[@class="detail-city"]/span', '');
    $county = houzez_scraper_first_text($xpath, '//div[@id="property-address-wrap"]//li[@class="detail-state"]/span', '');

    // Gallery links point at a 758x564 thumbnail; dropping the size suffix
    // yields the URL without that suffix.
    $imgLinks = [];
    foreach ($xpath->query("//a[contains(@class, 'img-wrap-1')]") as $anchor) {
        $href = $anchor->getAttribute('href');
        if (!empty($href)) {
            $imgLinks[] = str_replace('-758x564', '', $href);
        }
    }

    $result = get_property_type_and_category($propertyType);

    // Import as properties into WP Property Drive
    $property_data = [
        'post_author'    => 2,
        'post_content'   => $description,
        // mb_substr: a byte-wise cut can split a multibyte character and
        // leave invalid UTF-8 in the excerpt.
        'post_excerpt'   => mb_substr(sanitize_text_field($description), 0, 160),
        'post_title'     => $title,
        'post_status'    => 'publish',
        'post_type'      => 'property',
        'ping_status'    => 'closed',
        'comment_status' => 'closed',
    ];
    $post_id = wp_insert_post($property_data);

    // wp_insert_post() returns 0 on failure; without a post there is nothing
    // to attach meta, images or terms to.
    if ($post_id) {
        // Sort weight per listing status; unknown or empty statuses sort as 1.
        $status_order = [
            'for-sale'        => 1,
            'for-sale-to-let' => 1,
            'to-let'          => 1,
            'for-auction'     => 1,
            'coming-soon'     => 1,
            'under-offer'     => 2,
            'open-to-offers'  => 2,
            'reserved'        => 2,
            'seeking'         => 4,
            'let-agreed'      => 5,
            'sale-agreed'     => 5,
            'has-been-let'    => 6,
            'let'             => 6,
            'sold'            => 6,
        ];
        $property_order = 1;
        if ((string) $status !== '') {
            $property_order = $status_order[sanitize_title($status)] ?? 1;
        }

        $meta = [
            // Save the Unique ID to the new post for comparison
            'importer_id'      => $propertyId,
            'importer_api_key' => '',
            'source'           => 'Houzez (Scraper)',
            'date_created'     => $formattedDate,
            'date_modified'    => $formattedDate,
            'full_address'     => $title,
            'address_only'     => $title,
            'property_status'  => $status,
            'county'           => $county,
            'area'             => $city,
            'country'          => 'Ireland',
            'price'            => $price,
            'property_size'    => $size,
            'bedrooms'         => $bedrooms,
            'bathrooms'        => $bathrooms,
            'agent_name'       => 'Joe Naughton Auctioneers',
            'agent_number'     => '0906 449090',
            'agent_email'      => '[email protected]',
            'property_lock'    => 1,
            'international'    => 0,
            'list_reference'   => $propertyId,
            'property_order'   => $property_order,
            'property_market'  => $result['category'],
            'property_type'    => $result['primaryType'],
        ];
        foreach ($meta as $meta_key => $meta_value) {
            update_post_meta($post_id, $meta_key, $meta_value);
        }

        upload_images_and_set_featured($imgLinks, $post_id);

        // Taxonomy terms: type, area (city) and county. Empty names are
        // skipped by the helper.
        houzez_scraper_assign_term($post_id, $result['primaryType'], 'property_type');
        houzez_scraper_assign_term($post_id, $city, 'property_area');
        houzez_scraper_assign_term($post_id, $county, 'property_county');
    }

    return [
        'propertyId'   => $propertyId,
        'post_id'      => $post_id,
        'date'         => $formattedDate,
        'title'        => $title,
        'status'       => $status,
        'propertyType' => $result['primaryType'],
        'bedrooms'     => $bedrooms,
        'bathrooms'    => $bathrooms,
        'garage'       => $garage,
        'size'         => $size,
        'price'        => $price,
        'description'  => $description,
        'city'         => $city,
        // The county name itself (a string), not a term lookup result.
        'county'       => $county,
        'images'       => $imgLinks,
    ];
}
add_action('wp_ajax_fetch_property', 'handle_fetch_property');
// NOTE(review): the nopriv hook lets logged-out visitors trigger imports (the
// nonce is printed on every front-end page). Kept for backward compatibility;
// consider removing it or adding a capability check.
add_action('wp_ajax_nopriv_fetch_property', 'handle_fetch_property');

/**
 * AJAX handler: fetch one property page, import it and return the scraped data.
 *
 * Expects POST fields `nonce` (action `property_fetch_nonce`) and
 * `property_url`. Responds with a JSON success payload containing the array
 * returned by scrape_data(), or a JSON error with a `message`.
 *
 * @return void The wp_send_json_* helpers terminate the request.
 */
function handle_fetch_property() {
    check_ajax_referer('property_fetch_nonce', 'nonce');

    $url = '';
    if (isset($_POST['property_url']) && is_string($_POST['property_url'])) {
        $url = esc_url_raw(wp_unslash($_POST['property_url']), ['http', 'https']);
    }

    // Only fetch pages from the site this scraper targets; otherwise the
    // endpoint is an open proxy for arbitrary server-side requests (SSRF).
    $allowed_host = 'joenaughton.ie';
    $host = ($url !== '') ? strtolower((string) wp_parse_url($url, PHP_URL_HOST)) : '';
    $is_allowed_host = ($host === $allowed_host)
        || (substr($host, -strlen('.' . $allowed_host)) === '.' . $allowed_host);

    if (!$is_allowed_host) {
        wp_send_json_error(['message' => 'Invalid property URL'], 400);
    }

    $html = fetch_html($url);

    // An empty body cannot be parsed, so treat it like a failed request.
    if ($html === false || $html === '') {
        wp_send_json_error(['message' => 'Failed to fetch URL']);
    }

    $data = scrape_data($html);
    wp_send_json_success($data);
}
/**
 * Register the front-end scraper script and hand it its runtime settings.
 *
 * The script is loaded in the footer and receives a `propertyScraperData`
 * object holding the admin-ajax endpoint and the nonce checked by the
 * `fetch_property` AJAX action.
 *
 * @return void
 */
function enqueue_scraper_scripts() {
    $handle = 'property-scraper';
    $script_url = plugins_url('property-scraper.js', __FILE__);

    // No dependencies, version 1.0.0, printed in the footer.
    wp_enqueue_script($handle, $script_url, [], '1.0.0', true);

    $settings = [
        'ajaxUrl' => admin_url('admin-ajax.php'),
        'nonce'   => wp_create_nonce('property_fetch_nonce'),
    ];
    wp_localize_script($handle, 'propertyScraperData', $settings);
}
add_action('wp_enqueue_scripts', 'enqueue_scraper_scripts');
/**
 * Shortcode [houzez-properties]: reset the import and render the scraper UI.
 *
 * Reads the property sitemap, deletes every previously scraped `property`
 * post, and prints a container whose `data-urls` attribute lists the pages
 * the front-end script should import one by one.
 *
 * NOTE(review): rendering this shortcode is destructive (it permanently
 * deletes posts) and is not restricted to any capability - confirm the page
 * that embeds it is private.
 *
 * @return string HTML markup, or a plain error message when the sitemap
 *                cannot be fetched or parsed.
 */
function houzez_scraper_shortcode() {
    // Fetch and validate the sitemap BEFORE deleting anything, so a transient
    // network or parsing failure does not wipe the existing properties.
    $sitemap_url = 'https://www.joenaughton.ie/property-sitemap.xml';
    $response = fetch_html($sitemap_url);
    if (!$response) {
        return 'Error fetching sitemap';
    }

    // Keep libxml parse warnings out of the page output.
    $previous_libxml = libxml_use_internal_errors(true);
    $xml = simplexml_load_string($response);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml);
    if ($xml === false) {
        return 'Error parsing XML';
    }

    $urls = [];
    foreach ($xml->url as $url) {
        $loc = (string) $url->loc;
        // Skip the archive page itself; only single property pages are scraped.
        if ($loc !== 'https://www.joenaughton.ie/property/') {
            $urls[] = $loc;
        }
    }
    // NOTE(review): only the first URL is processed - looks like a testing
    // limit; confirm before removing.
    $urls = array_slice($urls, 0, 1);

    $out = '';

    // Cleanup: remove everything a previous run of this scraper imported.
    $args = [
        'fields'         => 'ids',
        'post_type'      => 'property',
        'post_status'    => 'publish',
        'posts_per_page' => -1,
        'meta_query'     => [
            [
                'key'     => 'source',
                'value'   => [ 'Houzez (Scraper)' ],
                'compare' => 'IN',
            ],
        ],
    ];
    foreach (get_posts($args) as $stale_post_id) {
        $out .= '<div>Deleting post ID ' . (int) $stale_post_id . '</div>';
        wp_delete_post($stale_post_id, true);
    }

    // Go!
    $out .= '<div id="property-scraper-container">';
    $out .= '<div id="property-urls" data-urls="' . esc_attr(wp_json_encode($urls)) . '"></div>';
    $out .= '<div id="progress-container">';
    $out .= '<div id="progress-bar" style="width: 0%; height: 20px; background-color: #4CAF50;"></div>';
    $out .= '<div id="progress-text">0% Complete</div>';
    $out .= '</div>';
    $out .= '<div id="properties-container"></div>';
    $out .= '</div>';

    return $out;
}
add_shortcode('houzez-properties', 'houzez_scraper_shortcode');
/**
 * Derive the primary property type from a comma-separated type string.
 *
 * Resolution order:
 *  1. If a residential keyword occurs, the first comma-separated segment that
 *     does NOT contain that keyword wins.
 *  2. Otherwise, if a commercial or land keyword occurs, the first segment wins.
 *  3. Otherwise, if a known type word appears as a whole word, the first
 *     segment wins; failing that, the whole trimmed string is returned.
 *
 * Keyword checks in steps 1 and 2 are case-insensitive substring matches.
 *
 * @param string $typeString Raw type label, e.g. "Residential, House".
 * @return string The primary type (may be an empty string).
 */
function get_property_type($typeString) {
    $residentialKeywords = ['Residential', 'House', 'Apartment', 'Bungalow'];
    $commercialKeywords = ['Commercial', 'Hotel', 'Bar', 'Industrial', 'Unit', 'Office', 'Retail'];
    $landKeywords = ['Land', 'Site', 'Sites', 'Agricultural', 'Forestry'];

    // Step 1: residential - pick the first segment free of the matched keyword.
    foreach ($residentialKeywords as $needle) {
        if (stripos($typeString, $needle) === false) {
            continue;
        }
        foreach (explode(',', $typeString) as $segment) {
            if (stripos($segment, $needle) === false) {
                return trim($segment);
            }
        }
    }

    // Step 2: commercial, then land - both resolve to the first segment.
    foreach (array_merge($commercialKeywords, $landKeywords) as $needle) {
        if (stripos($typeString, $needle) !== false) {
            return trim(explode(',', $typeString)[0]);
        }
    }

    // Step 3: whole-word fallbacks, each resolving to the first segment.
    $wordPatterns = [
        '/\b(house|apartment|bungalow)\b/i',
        '/\b(site|sites|agricultural|forestry)\b/i',
        '/\b(hotel|bar|hotel\/bar|industrial|unit|office|retail)\b/i',
    ];
    foreach ($wordPatterns as $pattern) {
        if (preg_match($pattern, $typeString)) {
            return trim(explode(',', $typeString)[0]);
        }
    }

    // Nothing recognised: hand back the original string.
    return trim($typeString);
}
/**
 * Classify a type string into a market category and a primary type.
 *
 * The category is the first match among, in order: the whole words
 * "Residential", "Commercial", "Land", then residential, land and commercial
 * type words (case-insensitive). For Residential the primary type is the
 * first comma-separated segment that does not contain "Residential"; for
 * Commercial and Land it is the first segment. Both values are null when no
 * category is recognised.
 *
 * @param string $typeString Raw type label, e.g. "Residential, House".
 * @return array{primaryType: ?string, category: ?string}
 */
function get_property_type_and_category($typeString) {
    // Ordered rules: explicit category names first, then type keywords.
    $rules = [
        ['/\bResidential\b/i', 'Residential'],
        ['/\bCommercial\b/i', 'Commercial'],
        ['/\bLand\b/i', 'Land'],
        ['/\b(house|apartment|bungalow)\b/i', 'Residential'],
        ['/\b(site|sites|agricultural|forestry)\b/i', 'Land'],
        ['/\b(hotel|bar|hotel\/bar|industrial|unit|office|retail)\b/i', 'Commercial'],
    ];

    $category = null;
    foreach ($rules as $rule) {
        if (preg_match($rule[0], $typeString)) {
            $category = $rule[1];
            break;
        }
    }

    $primaryType = null;
    switch ($category) {
        case 'Residential':
            // First segment that is not the "Residential" label itself.
            foreach (explode(',', $typeString) as $segment) {
                if (stripos($segment, 'Residential') === false) {
                    $primaryType = trim($segment);
                    break;
                }
            }
            break;

        case 'Commercial':
        case 'Land':
            $primaryType = trim(explode(',', $typeString)[0]);
            break;
    }

    return [
        'primaryType' => $primaryType,
        'category' => $category,
    ];
}
/**
 * Sideload remote images into the media library and record them on a post.
 *
 * Each image is attached to $post_id and given a `menu_order` matching its
 * position among the successfully uploaded images. The resulting attachment
 * URLs are stored in the `wppd_pics` meta field and the first one in
 * `wppd_primary_image`. Images that fail to download are logged and skipped.
 *
 * @param array $imgLinks Remote image URLs, in display order.
 * @param int   $post_id  Post the images belong to.
 * @return void
 */
function upload_images_and_set_featured($imgLinks, $post_id) {
    // Nothing to do for an empty or malformed list.
    if (empty($imgLinks) || !is_array($imgLinks)) {
        return;
    }

    // media_sideload_image() lives in admin-only includes; load them in case
    // this runs outside a context where they are already available.
    if (!function_exists('media_sideload_image')) {
        require_once ABSPATH . 'wp-admin/includes/media.php';
        require_once ABSPATH . 'wp-admin/includes/file.php';
        require_once ABSPATH . 'wp-admin/includes/image.php';
    }

    $uploaded_image_urls = [];
    $counter = 0; // Position among the successfully uploaded images.

    foreach ($imgLinks as $imgLink) {
        // Sideloading also generates the attachment metadata (image sizes),
        // so no separate wp_generate_attachment_metadata() pass is needed.
        $attachment_id = media_sideload_image($imgLink, $post_id, null, 'id');

        if (is_wp_error($attachment_id)) {
            error_log('Image upload error: ' . $attachment_id->get_error_message());
            continue; // Skip this image and continue with the next one
        }

        // Preserve the gallery order via the attachment's menu_order.
        wp_update_post([
            'ID'         => $attachment_id,
            'menu_order' => $counter,
        ]);
        $counter++;

        // wp_get_attachment_url() returns false when the URL is unavailable;
        // do not store that in the gallery list.
        $attachment_url = wp_get_attachment_url($attachment_id);
        if ($attachment_url !== false) {
            $uploaded_image_urls[] = $attachment_url;
        }
    }

    // Update custom meta fields with the uploaded image URLs. When every
    // upload failed there is no first element, so fall back to an empty
    // string instead of reading an undefined offset.
    update_post_meta($post_id, 'wppd_pics', $uploaded_image_urls);
    update_post_meta($post_id, 'wppd_primary_image', $uploaded_image_urls[0] ?? '');
}
/**
 * Front-end driver for the property scraper.
 *
 * Reads the list of property URLs from the `#property-urls` element, asks the
 * `fetch_property` AJAX action to import them one at a time, and updates the
 * progress bar and the result list after each request.
 */
document.addEventListener('DOMContentLoaded', function () {
    const urlsContainer = document.getElementById('property-urls');
    if (!urlsContainer) return;

    let urls;
    try {
        urls = JSON.parse(urlsContainer.dataset.urls);
    } catch (error) {
        console.error('Error: invalid property URL list', error);
        return;
    }
    if (!Array.isArray(urls) || urls.length === 0) return;

    const totalUrls = urls.length;
    let processedUrls = 0;

    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

    // Scraped values come from a remote page, so they must be escaped before
    // being placed into innerHTML (text and attribute contexts alike).
    function escapeHtml(value) {
        const text = value === null || value === undefined ? '' : String(value);
        return text.replace(/[&<>"']/g, (char) => HTML_ESCAPES[char]);
    }

    function updateProgress() {
        const percentage = (processedUrls / totalUrls) * 100;
        document.getElementById('progress-bar').style.width = percentage + '%';
        document.getElementById('progress-text').textContent =
            Math.round(percentage) + '% Complete (' + processedUrls + '/' + totalUrls + ')';
    }

    // Import a single property; failures are logged and do not stop the run.
    async function fetchProperty(url) {
        const formData = new FormData();
        formData.append('action', 'fetch_property');
        formData.append('property_url', url);
        formData.append('nonce', propertyScraperData.nonce);

        try {
            const response = await fetch(propertyScraperData.ajaxUrl, {
                method: 'POST',
                body: formData,
                credentials: 'same-origin'
            });
            const data = await response.json();
            if (data.success) {
                displayProperty(data.data);
            } else {
                console.error('Error fetching property:', data);
            }
        } catch (error) {
            console.error('Error:', error);
        }
    }

    // Process the URLs sequentially so the server handles one import at a time.
    async function processAll() {
        for (const url of urls) {
            await fetchProperty(url);
            processedUrls++;
            updateProgress();
        }
    }

    function displayProperty(property) {
        const container = document.getElementById('properties-container');
        const propertyDiv = document.createElement('div');
        propertyDiv.className = 'property-item';

        const title = escapeHtml(property.title);
        const images = Array.isArray(property.images) ? property.images : [];
        const imagesHtml = images
            .map(img => `<img src="${escapeHtml(img)}" height="40" style="height:40px">`)
            .join('');

        // NOTE(review): property.description is inserted as raw HTML on
        // purpose (it is the scraped description markup), which means the
        // remote site can inject script here. Consider sanitising it
        // server-side (e.g. wp_kses_post) before returning it.
        propertyDiv.innerHTML = `
<details>
<summary>${title} (Click to open)</summary>
<h2>${title}</h2>
<div class="property-details">
<p><strong>ID:</strong> ${escapeHtml(property.propertyId)}</p>
<p><strong>Post ID:</strong> ${escapeHtml(property.post_id)}</p>
<p><strong>Date:</strong> ${escapeHtml(property.date)}</p>
<p><strong>Status:</strong> ${escapeHtml(property.status)}</p>
<p><strong>Type:</strong> ${escapeHtml(property.propertyType)}</p>
<p><strong>Bedrooms:</strong> ${escapeHtml(property.bedrooms)}</p>
<p><strong>Bathrooms:</strong> ${escapeHtml(property.bathrooms)}</p>
<p><strong>Garage:</strong> ${escapeHtml(property.garage)}</p>
<p><strong>Size:</strong> ${escapeHtml(property.size)} Sq Ft</p>
<p><strong>Price:</strong> ${escapeHtml(property.price)}</p>
<p><strong>Location:</strong> ${escapeHtml(property.city)}, ${escapeHtml(property.county)}</p>
</div>
<div class="property-images">
${imagesHtml}
</div>
<details>
<summary>Description</summary>
<div class="property-description">
${property.description}
</div>
</details>
</details>
`;
        container.appendChild(propertyDiv);
    }

    // Start processing from the first URL
    processAll();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment