Last active
August 14, 2023 18:19
-
-
Save bjornjohansen/4905c93f9bd44e6084ec to your computer and use it in GitHub Desktop.
Basic sitemap crawler to warm up a full page cache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php | |
<?php | |
/** | |
* @license http://www.wtfpl.net/txt/copying/ WTFPL | |
*/ | |
// Pin the default timezone so PHP's date functions behave the same on every host.
date_default_timezone_set( 'UTC' );

// Sitemaps (or sitemap index files) whose URLs should be requested.
$sitemaps = [
	'https://bjornjohansen.no/sitemap_index.xml',
];

// Constructing the crawler fetches and parses the sitemaps; run() then requests every collected URL.
$crawler = new BJ_Crawler( $sitemaps );
$crawler->run();
/**
 * Crawler class
 *
 * Collects page URLs from XML sitemaps (including sitemap index files) and
 * requests each of them, e.g. to warm up a full page cache.
 */
class BJ_Crawler {

	/**
	 * Sitemap URLs that have already been queued for fetching.
	 *
	 * @var string[]|null
	 */
	protected $_sitemaps = null;

	/**
	 * Page URLs collected from the sitemaps, in discovery order.
	 *
	 * @var string[]|null
	 */
	protected $_urls = null;

	/**
	 * Constructor
	 *
	 * @param array|string $sitemaps A string with a URL to an XML sitemap, or an array with URLs to XML sitemaps. Sitemap index files work too.
	 */
	public function __construct( $sitemaps = null ) {
		$this->_sitemaps = [];
		$this->_urls     = [];

		if ( is_null( $sitemaps ) ) {
			return;
		}

		if ( ! is_array( $sitemaps ) ) {
			$sitemaps = [ $sitemaps ];
		}

		foreach ( $sitemaps as $sitemap ) {
			$this->add_sitemap( $sitemap );
		}
	}

	/**
	 * Fetch a sitemap and add its contents to our crawl stack. Sitemap index
	 * files work too: every sitemap they reference is fetched recursively.
	 *
	 * @param string $sitemapurl URL to an XML sitemap or sitemap index.
	 * @return bool True if the sitemap was parsed (or had already been added),
	 *              false if the URL is empty, the request did not return
	 *              HTTP 200, or the body is not well-formed XML.
	 */
	public function add_sitemap( $sitemapurl ) {
		$sitemapurl = trim( (string) $sitemapurl );
		if ( '' === $sitemapurl ) {
			return false;
		}

		// Each sitemap is fetched only once; this also stops index files that reference each other from recursing forever.
		if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
			return true;
		}
		$this->_sitemaps[] = $sitemapurl;

		$ch = curl_init();
		curl_setopt( $ch, CURLOPT_URL, $sitemapurl );
		curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
		$content          = curl_exec( $ch );
		$http_return_code = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

		// Drop the handle now so it is not kept alive while nested sitemaps are fetched below.
		unset( $ch );

		// curl_exec() returns false on transport errors; an empty body cannot be a sitemap either.
		if ( 200 !== $http_return_code || ! is_string( $content ) || '' === $content ) {
			return false;
		}

		// simplexml_load_string() returns false on malformed XML, whereas
		// `new SimpleXMLElement()` would throw. Parser errors are collected
		// internally instead of being emitted as warnings.
		$previous_setting = libxml_use_internal_errors( true );
		$xml              = simplexml_load_string( $content, 'SimpleXMLElement', LIBXML_NOBLANKS );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_setting );

		// Compare against false explicitly: a valid but empty root element is falsy.
		if ( false === $xml ) {
			return false;
		}

		switch ( $xml->getName() ) {
			case 'sitemapindex':
				foreach ( $xml->sitemap as $sitemap ) {
					// Cast instead of reset(): calling reset() on objects is deprecated as of PHP 8.1.
					$this->add_sitemap( (string) $sitemap->loc );
				}
				break;

			case 'urlset':
				foreach ( $xml->url as $url ) {
					$this->add_url( (string) $url->loc );
				}
				break;

			default:
				// Unknown root element: nothing to collect.
				break;
		}

		return true;
	}

	/**
	 * Add a URL to our crawl stack. Empty values and URLs that are already
	 * on the stack are ignored.
	 *
	 * @param string $url URL to request when the crawl runs.
	 */
	public function add_url( $url ) {
		$url = trim( (string) $url );

		if ( '' === $url || in_array( $url, $this->_urls, true ) ) {
			return;
		}

		$this->_urls[] = $url;
	}

	/**
	 * Run the crawl: request every collected URL, a few at a time.
	 *
	 * Response bodies are discarded; the requests are made only for their
	 * effect on the server.
	 *
	 * @param int $concurrency Number of URLs to request in parallel with curl multi. Values below 1 are treated as 1.
	 */
	public function run( $concurrency = 5 ) {
		$concurrency = max( 1, (int) $concurrency );

		foreach ( array_chunk( $this->_urls, $concurrency ) as $chunk ) {
			$mh      = curl_multi_init();
			$handles = [];

			foreach ( $chunk as $url ) {
				$ch = curl_init();
				curl_setopt( $ch, CURLOPT_URL, $url );
				// Capture the body instead of printing it to stdout.
				curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
				curl_multi_add_handle( $mh, $ch );
				$handles[] = $ch;
			}

			$active = null;
			do {
				do {
					$mrc = curl_multi_exec( $mh, $active );
				} while ( CURLM_CALL_MULTI_PERFORM === $mrc );

				// Wait for activity before driving the transfers again. When
				// select reports -1, back off briefly and still call
				// curl_multi_exec() on the next pass so the loop always makes
				// progress instead of spinning.
				if ( $active && CURLM_OK === $mrc && -1 === curl_multi_select( $mh ) ) {
					usleep( 10000 );
				}
			} while ( $active && CURLM_OK === $mrc );

			// Detach every transfer and release the handles before the next chunk.
			foreach ( $handles as $handle ) {
				curl_multi_remove_handle( $mh, $handle );
			}
			curl_multi_close( $mh );
			unset( $handles, $handle, $ch, $mh );
		}
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Wow, that's very nice of you @bjornjohansen – thanks!
Here we go: https://packagist.org/packages/eliashaeussler/cache-warmup 🎉