Last active
February 5, 2020 22:06
-
-
Save madeingnecca/cdae7df746897f73d02325fb44a85be9 to your computer and use it in GitHub Desktop.
Crawl a sitemap.xml file using PHP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// The script reads its input from $argv; stop right away when it is not set.
if (!isset($argv)) {
    die("This script only works in cli mode.\n");
}

// A non-empty site URL is mandatory. The usage message goes to STDERR and the
// process exits with a non-zero status so callers can detect the failure.
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: " . $argv[0] . " <SITE_URL>\n");
    exit(1);
}
/**
 * Performs an HTTP request with cURL and returns its status code and body.
 *
 * Redirects are followed and a desktop Chrome user agent string is sent.
 *
 * @param string   $url     URL to request.
 * @param string   $method  'GET' (default) or 'HEAD' (case-insensitive). Any
 *                          other value is sent as a plain GET.
 * @param int|null $timeout Value for CURLOPT_TIMEOUT in seconds. NULL leaves
 *                          cURL's own default in place.
 *
 * @return array Associative array with the keys:
 *               - 'code':  status code reported by cURL for the last response
 *                          (0 when no response was received);
 *               - 'body':  return value of curl_exec() (FALSE on failure);
 *               - 'error': cURL error message, empty string on success.
 */
function http_request($url, $method = 'GET', $timeout = NULL) {
    $ch = curl_init();
    if ($ch === FALSE) {
        return array(
            'code' => 0,
            'body' => FALSE,
            'error' => 'Unable to initialise a cURL handle.',
        );
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    if (isset($timeout)) {
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    }

    // CURLOPT_NOBODY alone makes cURL issue a HEAD request; forcing the verb
    // through CURLOPT_CUSTOMREQUEST as well is redundant.
    if (strtoupper((string) $method) === 'HEAD') {
        curl_setopt($ch, CURLOPT_NOBODY, TRUE);
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    // NOTE(review): peer certificate verification is disabled, so HTTPS
    // responses are not authenticated. Kept to preserve existing behaviour.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36');

    $body = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    // Capture the transport error before the handle is closed so callers can
    // tell a network failure apart from an HTTP error status.
    $error = ($body === FALSE) ? curl_error($ch) : '';
    curl_close($ch);

    return array(
        'code' => $code,
        'body' => $body,
        'error' => $error,
    );
}
// Build the sitemap location from the site URL given on the command line.
// Trailing slashes are stripped first so that "https://example.com/" does not
// turn into "https://example.com//sitemap.xml".
$sitemap_url = rtrim($argv[1], '/') . '/sitemap.xml';
fetch_sitemap($sitemap_url);
/**
 * Downloads a sitemap, recurses into its <sitemap> entries and then sends a
 * HEAD request to every <url> entry, printing one progress line per request.
 *
 * The script terminates with exit status 1 when the sitemap cannot be fetched
 * with a 200 response or its body is not parseable XML.
 *
 * @param string $url            URL of the sitemap to download.
 * @param int    $sitemap_index  Zero-based position of this sitemap among its
 *                               siblings; only used in the progress output.
 * @param int    $total_sitemaps Number of sibling sitemaps; only used in the
 *                               progress output.
 */
function fetch_sitemap($url, $sitemap_index = 0, $total_sitemaps = 1) {
    echo "Visiting sitemap " . ($sitemap_index + 1) . "/" . $total_sitemaps . " at $url\n";

    $sitemap_response = http_request($url, 'GET');
    if ($sitemap_response['code'] != 200) {
        fwrite(STDERR, 'GET ' . $sitemap_response['code'] . ' ' . $url . "\n");
        exit(1);
    }

    // SimpleXMLElement throws when the body is not well-formed XML; report it
    // as a regular failure instead of an uncaught exception.
    try {
        $xml = new SimpleXMLElement($sitemap_response['body']);
    }
    catch (Exception $e) {
        fwrite(STDERR, 'Invalid XML in sitemap ' . $url . ': ' . $e->getMessage() . "\n");
        exit(1);
    }

    // Sitemap index: visit every referenced sub-sitemap, passing the child's
    // own position so the progress output counts 1/N, 2/N, ...
    $sitemaps = $xml->sitemap;
    $subsitemap_count = count($sitemaps);
    $subsitemap_index = 0;
    foreach ($sitemaps as $sitemap) {
        $subsitemap_url = trim((string) $sitemap->loc);
        if ($subsitemap_url !== '') {
            fetch_sitemap($subsitemap_url, $subsitemap_index, $subsitemap_count);
        }
        $subsitemap_index++;
    }

    // Collect the page URLs. Surrounding whitespace (common in pretty-printed
    // XML) is trimmed and entries without a location are skipped.
    $urls = array();
    foreach ($xml->url as $url_node) {
        $page_url = trim((string) $url_node->loc);
        if ($page_url !== '') {
            $urls[] = $page_url;
        }
    }
    sort($urls);

    $method = 'HEAD';
    $url_total = count($urls);
    foreach ($urls as $url_index => $page_url) {
        // microtime() gives sub-second resolution; time() would report 0s for
        // nearly every request.
        $time_start = microtime(TRUE);
        $response = http_request($page_url, $method, 10);
        $elapsed = microtime(TRUE) - $time_start;

        echo sprintf(
            "%d/%d %s %s (%.2fs) %s\n",
            $url_index + 1,
            $url_total,
            $method,
            $response['code'],
            $elapsed,
            $page_url
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment