-
-
Save bjornjohansen/4905c93f9bd44e6084ec to your computer and use it in GitHub Desktop.
#!/usr/bin/php | |
<?php | |
/** | |
* @license http://www.wtfpl.net/txt/copying/ WTFPL | |
*/ | |
// Use UTC as the default timezone for all date/time functions.
date_default_timezone_set( 'UTC' );

// Sitemaps (or sitemap index files) whose URLs should be crawled.
$sitemaps = [
    'https://bjornjohansen.no/sitemap_index.xml',
];

// Building the crawler reads the sitemaps; run() then requests every URL found.
$crawler = new BJ_Crawler( $sitemaps );
$crawler->run();
/**
 * Crawler class
 *
 * Collects page URLs from XML sitemaps (sitemap index files are followed
 * recursively) and then requests every collected URL in small parallel
 * batches. Response bodies are discarded; only the requests themselves matter.
 */
class BJ_Crawler {

    /**
     * Sitemap URLs that have already been queued. Used to avoid fetching the
     * same sitemap twice (and to break reference loops between index files).
     *
     * @var string[]
     */
    protected $_sitemaps = null;

    /**
     * Unique page URLs collected from the sitemaps, in discovery order.
     *
     * @var string[]
     */
    protected $_urls = null;

    /**
     * Constructor
     *
     * @param array|string $sitemaps A string with a URL to an XML sitemap, or an array with URLs to XML sitemaps. Sitemap index files work too.
     */
    public function __construct( $sitemaps = null ) {
        $this->_sitemaps = [];
        $this->_urls     = [];

        if ( is_null( $sitemaps ) ) {
            return;
        }

        if ( ! is_array( $sitemaps ) ) {
            $sitemaps = [ $sitemaps ];
        }

        foreach ( $sitemaps as $sitemap ) {
            $this->add_sitemap( $sitemap );
        }
    }

    /**
     * Add a sitemap URL to our crawl stack. Sitemap index files work too.
     *
     * The sitemap is downloaded immediately. For a sitemap index every child
     * sitemap is added recursively; for a URL set every <loc> is queued.
     *
     * @param string $sitemapurl URL to an XML sitemap or sitemap index
     * @return false|null False if the sitemap could not be downloaded or parsed, null otherwise.
     */
    public function add_sitemap( $sitemapurl ) {
        if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
            return;
        }
        // Register before fetching so sitemaps referencing each other cannot recurse forever.
        $this->_sitemaps[] = $sitemapurl;

        $content = $this->fetch( $sitemapurl );
        if ( false === $content ) {
            return false;
        }

        $xml = $this->parse_xml( $content );
        if ( false === $xml ) {
            return false;
        }

        switch ( $xml->getName() ) {
            case 'sitemapindex':
                foreach ( $xml->sitemap as $sitemap ) {
                    // Cast to string: the documented way to read an element's text.
                    $loc = trim( (string) $sitemap->loc );
                    if ( '' !== $loc ) {
                        $this->add_sitemap( $loc );
                    }
                }
                break;
            case 'urlset':
                foreach ( $xml->url as $url ) {
                    $loc = trim( (string) $url->loc );
                    if ( '' !== $loc ) {
                        $this->add_url( $loc );
                    }
                }
                break;
            default:
                // Unknown root element: nothing to collect.
                break;
        }
    }

    /**
     * Add a URL to our crawl stack
     *
     * @param string $url URL to check
     */
    public function add_url( $url ) {
        if ( ! in_array( $url, $this->_urls, true ) ) {
            $this->_urls[] = $url;
        }
    }

    /**
     * Run the crawl
     *
     * Requests all collected URLs, five at a time, using curl multi.
     */
    public function run() {
        // Split our URLs into chunks of 5 URLs to use with curl multi
        foreach ( array_chunk( $this->_urls, 5 ) as $chunk ) {
            $mh      = curl_multi_init();
            $handles = [];

            foreach ( $chunk as $url ) {
                $ch = curl_init();
                curl_setopt( $ch, CURLOPT_URL, $url );
                curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
                curl_multi_add_handle( $mh, $ch );
                $handles[] = $ch;
            }

            $active = null;
            do {
                $mrc = curl_multi_exec( $mh, $active );

                // Wait for socket activity. curl_multi_select() may return -1
                // right away; sleep briefly in that case so the loop neither
                // spins at full CPU nor stops driving the transfers.
                if ( $active && CURLM_OK === $mrc && -1 === curl_multi_select( $mh ) ) {
                    usleep( 1000 );
                }
            } while ( CURLM_CALL_MULTI_PERFORM === $mrc || ( $active && CURLM_OK === $mrc ) );

            // Detach the finished transfers and release the multi handle.
            foreach ( $handles as $ch ) {
                curl_multi_remove_handle( $mh, $ch );
            }
            curl_multi_close( $mh );
        }
    }

    /**
     * Download a URL and return its body.
     *
     * The curl handle is local to this method, so it is released as soon as
     * the method returns rather than being held open while child sitemaps
     * are processed.
     *
     * @param string $url URL to download
     * @return string|false Response body, or false on a transfer error or a non-200 status.
     */
    private function fetch( $url ) {
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );

        $body   = curl_exec( $ch );
        $status = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

        if ( false === $body || 200 !== $status ) {
            return false;
        }

        return $body;
    }

    /**
     * Parse an XML document without throwing or emitting warnings on bad input.
     *
     * @param string $content Raw XML
     * @return SimpleXMLElement|false Parsed document, or false if the XML is malformed.
     */
    private function parse_xml( $content ) {
        // Collect libxml errors internally instead of emitting PHP warnings,
        // then restore whatever setting the caller had.
        $previous = libxml_use_internal_errors( true );
        $xml      = simplexml_load_string( $content, 'SimpleXMLElement', LIBXML_NOBLANKS );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous );

        return $xml;
    }
}
@bjornjohansen I'm going to create a Composer package for cache warmup and would like to use parts of your code. Is this okay for you? Of course I'd add a source link to this gist file :)
@bjornjohansen I'm going to create a Composer package for cache warmup and would like to use parts of your code. Is this okay for you? Of course I'd add a source link to this gist file :)
Yes, of course @eliashaeussler
You don’t need to link back here. I’m hereby granting you a WTFPL license :-)
@bjornjohansen I'm going to create a Composer package for cache warmup and would like to use parts of your code. Is this okay for you? Of course I'd add a source link to this gist file :)
Yes, of course @eliashaeussler
You don’t need to link back here. I’m hereby granting you a WTFPL license :-)
Wow, that's very nice of you @bjornjohansen – thanks!
Here we go: https://packagist.org/packages/eliashaeussler/cache-warmup 🎉
Hello, does it work with WordPress version 5.4.2 and the LiteSpeed Cache plugin? Please reply, thank you.
Can I add my sub-sitemap?