Created
July 16, 2025 15:20
-
-
Save wolffe/836bef3583d6cce06b4cb81644631c9e to your computer and use it in GitHub Desktop.
A Houzez scraper plugin for WordPress
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Plugin Name: Houzez Property Scraper
 * Plugin URI: https://www.4property.com/
 * Description: A Houzez scraper plugin for WordPress.
 * Version: 1.0.0
 * Author: Ciprian Popescu
 * Author URI: https://www.4property.com/
 * License: GNU General Public License v3 or later
 * License URI: https://www.gnu.org/licenses/gpl-3.0.html
 */
/**
 * Download the body of a URL with cURL.
 *
 * Failures are written to the PHP error log instead of being echoed, because
 * this function runs inside an AJAX request where stray output would corrupt
 * the JSON response.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false on transport error or HTTP status >= 400.
 */
function fetch_html($url) {
    $ch = curl_init();
    if ($ch === false) {
        error_log('cURL Error: unable to initialise a cURL handle');
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => false,
        CURLOPT_FOLLOWLOCATION => true,
        // Bound redirects and time so a slow or looping remote cannot hang the request.
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);

    $response   = curl_exec($ch);
    $error      = ($response === false) ? curl_error($ch) : '';
    $statusCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($response === false) {
        error_log('cURL Error: ' . $error);
        return false;
    }

    // An error page (404, 500, ...) is not scrapeable content; report it as a failure.
    if ($statusCode >= 400) {
        error_log('cURL Error: HTTP ' . $statusCode . ' while fetching ' . $url);
        return false;
    }

    return $response;
}
/**
 * Return the trimmed text of the first node matching an XPath query.
 *
 * @param DOMXPath $xpath   XPath evaluator bound to the scraped document.
 * @param string   $query   XPath expression.
 * @param mixed    $default Value returned when nothing matches.
 * @return mixed Trimmed text content of the first match, or $default.
 */
function houzez_scraper_first_text($xpath, $query, $default = '') {
    $nodes = $xpath->query($query);
    if ($nodes === false || $nodes->length === 0) {
        return $default;
    }
    return trim($nodes->item(0)->textContent);
}

/**
 * Attach one named term to a post, creating the term when it does not exist yet.
 *
 * @param int    $post_id   Post to attach the term to.
 * @param string $term_name Human-readable term name.
 * @param string $taxonomy  Taxonomy slug.
 * @return void
 */
function houzez_scraper_assign_term($post_id, $term_name, $taxonomy) {
    $term_name = sanitize_text_field($term_name);
    if ($term_name === '') {
        return;
    }

    // term_exists() yields null/0 for a missing term and an array of IDs otherwise.
    $term = term_exists($term_name, $taxonomy);
    if (!$term) {
        $term = wp_insert_term($term_name, $taxonomy);
    }

    // wp_insert_term() returns WP_Error on failure; only a real term_id may be assigned.
    if (is_wp_error($term) || !is_array($term) || empty($term['term_id'])) {
        error_log('Houzez scraper: could not resolve term "' . $term_name . '" in ' . $taxonomy);
        return;
    }

    wp_set_object_terms($post_id, (int) $term['term_id'], $taxonomy);
}

/**
 * Parse a Houzez property page and import it as a `property` post.
 *
 * Extracts the listing fields via XPath, inserts a published post, stores the
 * fields as post meta, sideloads the gallery images and assigns the type, area
 * and county taxonomy terms.
 *
 * @param string $html Raw HTML of a single property page.
 * @return array Scraped fields plus the created `post_id` (0 when the insert failed).
 */
function scrape_data($html) {
    $dom = new DOMDocument();

    // Real-world HTML is rarely valid; silence libxml while parsing, then restore the caller's setting.
    $previousLibxmlState = libxml_use_internal_errors(true);
    $dom->loadHTML($html, LIBXML_NOERROR);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlState);

    $xpath = new DOMXPath($dom);

    $propIdNode = $xpath->query('//li[contains(@class, "item-tool houzez-print")]');
    $propertyId = ($propIdNode->length > 0) ? $propIdNode->item(0)->getAttribute('data-propid') : 0;

    // The page shows "Updated on Month D, YYYY"; fall back to today when absent or unparsable.
    $formattedDate = date('Y-m-d');
    $dateText      = houzez_scraper_first_text($xpath, '//span[contains(@class, "small-text grey")]', '');
    if (preg_match('/Updated on ([A-Za-z]+ \d{1,2}, \d{4})/', $dateText, $matches)) {
        $timestamp = strtotime($matches[1]);
        if ($timestamp !== false) {
            $formattedDate = date('Y-m-d', $timestamp);
        }
    }

    $title  = houzez_scraper_first_text($xpath, '//div[@class="page-title"]/h1', 'Title not found');
    $status = houzez_scraper_first_text($xpath, '//a[contains(@class, "label-status")]', '');

    // Overview values are addressed by list and item position inside the overview block.
    $overview     = '//div[@class="d-flex property-overview-data"]//ul[%d]/li[%d]/strong';
    $propertyType = houzez_scraper_first_text($xpath, sprintf($overview, 1, 2), '');
    $bedrooms     = houzez_scraper_first_text($xpath, sprintf($overview, 2, 1), 0);
    $bathrooms    = houzez_scraper_first_text($xpath, sprintf($overview, 3, 1), 0);
    $garage       = houzez_scraper_first_text($xpath, sprintf($overview, 4, 1), 0);
    $size         = houzez_scraper_first_text($xpath, sprintf($overview, 5, 1), '');

    // Keep digits only so the price is stored as a plain number string.
    $priceNode = $xpath->query('//ul[contains(@class, "item-price-wrap")]//li[@class="item-price"]');
    $price     = ($priceNode->length > 0) ? preg_replace('/[^\d]/', '', $priceNode->item(0)->textContent) : '';

    $descriptionNode = $xpath->query('//div[@id="property-description-wrap"]//div[@class="block-content-wrap"]');
    $description     = ($descriptionNode->length > 0) ? $dom->saveHTML($descriptionNode->item(0)) : '';

    $city   = houzez_scraper_first_text($xpath, '//div[@id="property-address-wrap"]//li[@class="detail-city"]/span', '');
    $county = houzez_scraper_first_text($xpath, '//div[@id="property-address-wrap"]//li[@class="detail-state"]/span', '');

    // Gallery links point at a resized variant; dropping the size suffix yields the original file.
    $imgLinks = [];
    foreach ($xpath->query("//a[contains(@class, 'img-wrap-1')]") as $anchor) {
        $href = $anchor->getAttribute('href');
        if (!empty($href)) {
            $imgLinks[] = str_replace('-758x564', '', $href);
        }
    }

    $result = get_property_type_and_category($propertyType);

    // Import as properties into WP Property Drive
    $post_id = wp_insert_post([
        'post_author'    => 2,
        'post_content'   => $description,
        // mb_substr keeps multibyte characters intact when truncating the excerpt.
        'post_excerpt'   => mb_substr(sanitize_text_field($description), 0, 160),
        'post_title'     => $title,
        'post_status'    => 'publish',
        'post_type'      => 'property',
        'ping_status'    => 'closed',
        'comment_status' => 'closed',
    ]);

    // wp_insert_post() returns 0 on failure; without a post there is nothing to attach data to.
    if (!$post_id || is_wp_error($post_id)) {
        error_log('Houzez scraper: wp_insert_post failed for "' . $title . '"');
        $post_id = 0;
    } else {
        // Listings whose status is empty or not in this map sort with the active ones (order 1).
        $status_array = [
            'for-sale'        => 1,
            'for-sale-to-let' => 1,
            'to-let'          => 1,
            'for-auction'     => 1,
            'coming-soon'     => 1,
            'under-offer'     => 2,
            'open-to-offers'  => 2,
            'reserved'        => 2,
            'seeking'         => 4,
            'let-agreed'      => 5,
            'sale-agreed'     => 5,
            'has-been-let'    => 6,
            'let'             => 6,
            'sold'            => 6,
        ];
        $status_key     = sanitize_title($status);
        $property_order = isset($status_array[$status_key]) ? $status_array[$status_key] : 1;

        $meta = [
            // Save the Unique ID to the new post for comparison
            'importer_id'      => $propertyId,
            'importer_api_key' => '',
            'source'           => 'Houzez (Scraper)',
            'date_created'     => $formattedDate,
            'date_modified'    => $formattedDate,
            'full_address'     => $title,
            'address_only'     => $title,
            'property_status'  => $status,
            'county'           => $county,
            'area'             => $city,
            'country'          => 'Ireland',
            'price'            => $price,
            'property_size'    => $size,
            'bedrooms'         => $bedrooms,
            'bathrooms'        => $bathrooms,
            'agent_name'       => 'Joe Naughton Auctioneers',
            'agent_number'     => '0906 449090',
            'agent_email'      => '[email protected]',
            'property_lock'    => 1,
            'international'    => 0,
            'list_reference'   => $propertyId,
            'property_order'   => $property_order,
            'property_market'  => $result['category'],
            'property_type'    => $result['primaryType'],
        ];
        foreach ($meta as $meta_key => $meta_value) {
            update_post_meta($post_id, $meta_key, $meta_value);
        }

        upload_images_and_set_featured($imgLinks, $post_id);

        // Taxonomy terms: property type, area and county (empty names are skipped by the helper).
        houzez_scraper_assign_term($post_id, (string) $result['primaryType'], 'property_type');
        houzez_scraper_assign_term($post_id, (string) $city, 'property_area');
        houzez_scraper_assign_term($post_id, (string) $county, 'property_county');
    }

    return [
        'propertyId'   => $propertyId,
        'post_id'      => $post_id,
        'date'         => $formattedDate,
        'title'        => $title,
        'status'       => $status,
        'propertyType' => $result['primaryType'],
        'bedrooms'     => $bedrooms,
        'bathrooms'    => $bathrooms,
        'garage'       => $garage,
        'size'         => $size,
        'price'        => $price,
        'description'  => $description,
        'city'         => $city,
        'county'       => $county,
        'images'       => $imgLinks,
    ];
}
add_action('wp_ajax_fetch_property', 'handle_fetch_property');
// NOTE(review): the nopriv hook lets logged-out visitors trigger imports (post creation);
// confirm this is intended before shipping.
add_action('wp_ajax_nopriv_fetch_property', 'handle_fetch_property');

/**
 * AJAX handler: fetch one property page, import it and reply with the scraped data as JSON.
 *
 * Expects POST fields `nonce` (action `property_fetch_nonce`) and `property_url`.
 *
 * @return void Always terminates by sending a JSON response.
 */
function handle_fetch_property() {
    check_ajax_referer('property_fetch_nonce', 'nonce');

    // The field may be missing or non-scalar; WordPress also adds slashes to $_POST values.
    $rawUrl = isset($_POST['property_url']) ? wp_unslash($_POST['property_url']) : '';
    $url    = is_string($rawUrl) ? sanitize_url(trim($rawUrl)) : '';

    // Refuse empty, malformed, local or private-network targets so the endpoint
    // cannot be used to probe internal services.
    if ($url === '' || !wp_http_validate_url($url)) {
        wp_send_json_error(['message' => 'Invalid property URL'], 400);
        return;
    }

    $html = fetch_html($url);
    if ($html === false) {
        wp_send_json_error(['message' => 'Failed to fetch URL']);
        return;
    }

    $data = scrape_data($html);
    wp_send_json_success($data);
}
/**
 * Register the front-end scraper script and hand it the AJAX endpoint and nonce.
 *
 * The script is loaded in the footer and receives its settings through the
 * global `propertyScraperData` object.
 *
 * @return void
 */
function enqueue_scraper_scripts() {
    $handle    = 'property-scraper';
    $scriptUrl = plugins_url('property-scraper.js', __FILE__);

    wp_enqueue_script($handle, $scriptUrl, [], '1.0.0', true);

    $clientSettings = [
        'ajaxUrl' => admin_url('admin-ajax.php'),
        'nonce'   => wp_create_nonce('property_fetch_nonce'),
    ];

    wp_localize_script($handle, 'propertyScraperData', $clientSettings);
}

add_action('wp_enqueue_scripts', 'enqueue_scraper_scripts');
/**
 * Shortcode `[houzez-properties]`: reset previously scraped properties and
 * render the container the front-end script uses to import them again.
 *
 * The sitemap is fetched and parsed BEFORE anything is deleted, so a network
 * or XML failure leaves the existing properties untouched.
 *
 * @return string HTML markup, or a short error message.
 */
function houzez_scraper_shortcode() {
    // Go!
    $sitemap_url = 'https://www.joenaughton.ie/property-sitemap.xml';
    $response    = fetch_html($sitemap_url);
    if (!$response) {
        return 'Error fetching sitemap';
    }

    // Collect XML parse errors internally instead of emitting PHP warnings into the page.
    $previousLibxmlState = libxml_use_internal_errors(true);
    $xml                 = simplexml_load_string($response);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlState);
    if ($xml === false) {
        return 'Error parsing XML';
    }

    // Every sitemap entry except the property archive page itself.
    $urls = [];
    foreach ($xml->url as $url) {
        $loc = (string) $url->loc;
        if ($loc !== 'https://www.joenaughton.ie/property/') {
            $urls[] = $loc;
        }
    }

    // NOTE(review): only the first URL is kept — this looks like a debugging limit; confirm before release.
    $urls = array_slice($urls, 0, 1);

    $out = '';

    // Cleanup: permanently remove every property this scraper created earlier.
    $change_properties = get_posts([
        'fields'         => 'ids',
        'post_type'      => 'property',
        'post_status'    => 'publish',
        'posts_per_page' => -1,
        'meta_query'     => [
            [
                'key'     => 'source',
                'value'   => ['Houzez (Scraper)'],
                'compare' => 'IN',
            ],
        ],
    ]);
    foreach ($change_properties as $change_property) {
        $out .= '<div>Deleting post ID ' . (int) $change_property . '</div>';
        wp_delete_post($change_property, true);
    }

    // wp_json_encode() copes with invalid UTF-8 where json_encode() would return false.
    $out .= '<div id="property-scraper-container">';
    $out .= '<div id="property-urls" data-urls="' . esc_attr(wp_json_encode($urls)) . '"></div>';
    $out .= '<div id="progress-container">';
    $out .= '<div id="progress-bar" style="width: 0%; height: 20px; background-color: #4CAF50;"></div>';
    $out .= '<div id="progress-text">0% Complete</div>';
    $out .= '</div>';
    $out .= '<div id="properties-container"></div>';
    $out .= '</div>';

    return $out;
}

add_shortcode('houzez-properties', 'houzez_scraper_shortcode');
/**
 * Derive the primary property type from a comma-separated type string.
 *
 * Resolution order:
 *  1. For each residential keyword present in the string, return the first
 *     comma-separated segment that does NOT contain that keyword.
 *  2. If any commercial or land keyword is present, return the first segment.
 *  3. If a residential/land/commercial word matches as a whole word, return
 *     the first segment.
 *  4. Otherwise return the whole string, trimmed.
 *
 * @param string $typeString Raw type text, e.g. "Residential, House".
 * @return string Primary type (may be an empty string).
 */
function get_property_type($typeString) {
    // Step 1: residential keywords pick the first segment that is not the keyword itself.
    foreach (['Residential', 'House', 'Apartment', 'Bungalow'] as $needle) {
        if (stripos($typeString, $needle) === false) {
            continue;
        }
        foreach (explode(',', $typeString) as $segment) {
            if (stripos($segment, $needle) === false) {
                return trim($segment);
            }
        }
    }

    // Step 2: commercial keywords are checked before land keywords; both yield the first segment.
    $firstSegmentKeywords = [
        'Commercial', 'Hotel', 'Bar', 'Industrial', 'Unit', 'Office', 'Retail',
        'Land', 'Site', 'Sites', 'Agricultural', 'Forestry',
    ];
    foreach ($firstSegmentKeywords as $needle) {
        if (stripos($typeString, $needle) !== false) {
            $segments = explode(',', $typeString);
            return trim($segments[0]);
        }
    }

    // Step 3: whole-word fallbacks, each of which also resolves to the first segment.
    $wordPatterns = [
        '/\b(house|apartment|bungalow)\b/i',
        '/\b(site|sites|agricultural|forestry)\b/i',
        '/\b(hotel|bar|hotel\/bar|industrial|unit|office|retail)\b/i',
    ];
    foreach ($wordPatterns as $pattern) {
        if (preg_match($pattern, $typeString)) {
            $segments = explode(',', $typeString);
            return trim($segments[0]);
        }
    }

    // Step 4: nothing recognised — hand back the original text.
    return trim($typeString);
}
/**
 * Classify a type string into a market category and a primary property type.
 *
 * The category is the first match among, in order: the explicit words
 * "Residential", "Commercial", "Land", then residential, land and commercial
 * keyword groups (all whole-word, case-insensitive).
 *
 * For "Residential" the primary type is the first comma-separated segment that
 * does not contain the word "Residential"; for "Commercial" and "Land" it is
 * the first segment. Both values are null when nothing matches.
 *
 * @param string $typeString Raw type text, e.g. "Residential, House".
 * @return array{primaryType: ?string, category: ?string}
 */
function get_property_type_and_category($typeString) {
    // Ordered rules: the first pattern that matches decides the category.
    $categoryRules = [
        '/\bResidential\b/i'                                          => 'Residential',
        '/\bCommercial\b/i'                                           => 'Commercial',
        '/\bLand\b/i'                                                 => 'Land',
        '/\b(house|apartment|bungalow)\b/i'                           => 'Residential',
        '/\b(site|sites|agricultural|forestry)\b/i'                   => 'Land',
        '/\b(hotel|bar|hotel\/bar|industrial|unit|office|retail)\b/i' => 'Commercial',
    ];

    $category = null;
    foreach ($categoryRules as $pattern => $label) {
        if (preg_match($pattern, $typeString)) {
            $category = $label;
            break;
        }
    }

    $primaryType = null;
    if ($category === 'Residential') {
        // Skip segments that merely repeat the category name.
        foreach (explode(',', $typeString) as $segment) {
            if (stripos($segment, 'Residential') === false) {
                $primaryType = trim($segment);
                break;
            }
        }
    } elseif ($category !== null) {
        // Commercial and Land: the leading segment is the type.
        $segments    = explode(',', $typeString);
        $primaryType = trim($segments[0]);
    }

    return [
        'primaryType' => $primaryType,
        'category'    => $category,
    ];
}
/**
 * Sideload remote images into the media library and record them on a post.
 *
 * Each image is attached to the post and given a `menu_order` matching its
 * position among the successful uploads. The resulting URLs are stored in the
 * `wppd_pics` meta field and the first one in `wppd_primary_image`.
 *
 * NOTE(review): despite the name, no WordPress featured image (post thumbnail)
 * is set here — confirm whether `wppd_primary_image` is the intended equivalent.
 *
 * @param array $imgLinks Remote image URLs.
 * @param int   $post_id  Post the images belong to.
 * @return void
 */
function upload_images_and_set_featured($imgLinks, $post_id) {
    if (empty($imgLinks) || !is_array($imgLinks)) {
        return;
    }

    // The sideload helpers live in wp-admin includes that are not loaded on every request.
    if (!function_exists('media_sideload_image')) {
        require_once ABSPATH . 'wp-admin/includes/media.php';
        require_once ABSPATH . 'wp-admin/includes/file.php';
        require_once ABSPATH . 'wp-admin/includes/image.php';
    }

    $uploaded_image_urls = [];
    $counter             = 0; // Position among successfully uploaded images.

    foreach ($imgLinks as $imgLink) {
        $attachment_id = media_sideload_image($imgLink, $post_id, null, 'id');

        if (is_wp_error($attachment_id)) {
            error_log('Image upload error: ' . $attachment_id->get_error_message());
            continue; // Skip this image and continue with the next one
        }

        // Preserve gallery order. Attachment metadata was already generated by
        // the sideload itself, so it is not regenerated here.
        wp_update_post([
            'ID'         => $attachment_id,
            'menu_order' => $counter,
        ]);
        $counter++;

        $attachment_url = wp_get_attachment_url($attachment_id);
        if ($attachment_url !== false) {
            $uploaded_image_urls[] = $attachment_url;
        }
    }

    update_post_meta($post_id, 'wppd_pics', $uploaded_image_urls);

    // Every upload may have failed; only record a primary image when one exists.
    if (!empty($uploaded_image_urls)) {
        update_post_meta($post_id, 'wppd_primary_image', $uploaded_image_urls[0]);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Front-end driver for the Houzez scraper.
 *
 * Reads the list of property URLs rendered by the shortcode, asks the server
 * to import them one at a time through admin-ajax, and shows progress plus a
 * summary of each imported property.
 */
document.addEventListener('DOMContentLoaded', function () {
    const urlsContainer = document.getElementById('property-urls');
    if (!urlsContainer) return;

    // A missing or malformed data attribute must not abort the whole script.
    let urls = [];
    try {
        const parsed = JSON.parse(urlsContainer.dataset.urls);
        if (Array.isArray(parsed)) {
            urls = parsed;
        }
    } catch (error) {
        console.error('Error: invalid property URL list', error);
    }

    const totalUrls = urls.length;
    let processedUrls = 0;

    /** Escape a scraped value so it is rendered as text, never as markup. */
    function escapeHtml(value) {
        const replacements = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
        return String(value === null || value === undefined ? '' : value)
            .replace(/[&<>"']/g, function (ch) { return replacements[ch]; });
    }

    /** Refresh the progress bar and its caption from the processed counter. */
    function updateProgress() {
        const percentage = (processedUrls / totalUrls) * 100;
        document.getElementById('progress-bar').style.width = percentage + '%';
        document.getElementById('progress-text').textContent =
            Math.round(percentage) + '% Complete (' + processedUrls + '/' + totalUrls + ')';
    }

    /** Ask the server to import a single property URL and display the result. */
    async function fetchProperty(url) {
        const formData = new FormData();
        formData.append('action', 'fetch_property');
        formData.append('property_url', url);
        formData.append('nonce', propertyScraperData.nonce);

        try {
            const response = await fetch(propertyScraperData.ajaxUrl, {
                method: 'POST',
                body: formData,
                credentials: 'same-origin'
            });
            const data = await response.json();
            if (data && data.success) {
                displayProperty(data.data);
            } else {
                console.error('Error fetching property:', data);
            }
        } catch (error) {
            console.error('Error:', error);
        }
    }

    /** Append a collapsible summary of one imported property to the page. */
    function displayProperty(property) {
        const container = document.getElementById('properties-container');
        const propertyDiv = document.createElement('div');
        propertyDiv.className = 'property-item';

        const images = Array.isArray(property.images) ? property.images : [];
        const imagesHtml = images
            .map(img => `<img src="${escapeHtml(img)}" height="40" style="height:40px">`)
            .join('');

        // All scalar fields come from a third-party page and are escaped.
        // NOTE(review): the description is inserted as HTML on purpose (it is the
        // listing's formatted body) but it is untrusted markup — consider
        // sanitising it server-side (e.g. wp_kses_post) before it reaches here.
        propertyDiv.innerHTML = `
            <details>
                <summary>${escapeHtml(property.title)} (Click to open)</summary>
                <h2>${escapeHtml(property.title)}</h2>
                <div class="property-details">
                    <p><strong>ID:</strong> ${escapeHtml(property.propertyId)}</p>
                    <p><strong>Post ID:</strong> ${escapeHtml(property.post_id)}</p>
                    <p><strong>Date:</strong> ${escapeHtml(property.date)}</p>
                    <p><strong>Status:</strong> ${escapeHtml(property.status)}</p>
                    <p><strong>Type:</strong> ${escapeHtml(property.propertyType)}</p>
                    <p><strong>Bedrooms:</strong> ${escapeHtml(property.bedrooms)}</p>
                    <p><strong>Bathrooms:</strong> ${escapeHtml(property.bathrooms)}</p>
                    <p><strong>Garage:</strong> ${escapeHtml(property.garage)}</p>
                    <p><strong>Size:</strong> ${escapeHtml(property.size)} Sq Ft</p>
                    <p><strong>Price:</strong> ${escapeHtml(property.price)}</p>
                    <p><strong>Location:</strong> ${escapeHtml(property.city)}, ${escapeHtml(property.county)}</p>
                </div>
                <div class="property-images">
                    ${imagesHtml}
                </div>
                <details>
                    <summary>Description</summary>
                    <div class="property-description">
                        ${property.description}
                    </div>
                </details>
            </details>
        `;
        container.appendChild(propertyDiv);
    }

    /** Import every URL sequentially so the server handles one property at a time. */
    async function processAll() {
        for (const url of urls) {
            await fetchProperty(url);
            processedUrls++;
            updateProgress();
        }
    }

    // Start processing from the first URL
    if (totalUrls > 0) {
        processAll();
    }
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment