Last active
August 15, 2018 13:34
-
-
Save CodeBrauer/79be0dc44cfd29f8a708583e0d84e259 to your computer and use it in GitHub Desktop.
Get all snapshots from archive.org as list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "require": { | |
| "fabpot/goutte": "^3.2", | |
| "digitalnature/php-ref": "^1.2" | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| require_once 'vendor/autoload.php'; | |
| use Goutte\Client; | |
// Site whose archive.org snapshots we want to enumerate.
$url = 'http://medoo.in/';
/**
 * Get the URL of the first archive.org snapshot of a website.
 *
 * Requests https://web.archive.org/web/<year>/<url>, which the Wayback
 * Machine answers with a redirect to the earliest snapshot at or after
 * that year; we read the Location header instead of following it.
 *
 * @param string  $url  url of the archived website
 * @param integer $year year of the first snapshot (default: 1996)
 * @return string|false snapshot url, or false on transport/parse failure
 * @author Leksat <http://stackoverflow.com/a/11699301/1990745>
 */
function getFirstSnapshot($url, $year = 1996) {
    $waybackurl = "https://web.archive.org/web/$year/$url"; // < redirects to first snapshot
    $ch = curl_init($waybackurl);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => true,  // headers are all we need
        CURLOPT_FOLLOWLOCATION => false, // the redirect target IS the answer
    ]);
    $response = curl_exec($ch);
    curl_close($ch);
    // BUGFIX: bail out on transport errors instead of regex-matching `false`.
    if ($response === false) {
        return false;
    }
    // Only the first Location header matters, so preg_match (not _all) suffices.
    return preg_match('/^Location:(.*)$/mi', $response, $m) === 1
        ? trim($m[1])
        : false;
}
$first_url = getFirstSnapshot($url);
if ($first_url) {
    // The first 4-digit run in the snapshot URL is its year, e.g.
    // https://web.archive.org/web/19981212030634/... -> 1998.
    // (The original pattern /\d{4}+/ used a redundant possessive quantifier,
    //  and indexed the match without checking preg_match succeeded.)
    if (preg_match('/\d{4}/', $first_url, $firstFoundYear) !== 1) {
        die('Could not parse year from first snapshot URL');
    }
    $firstFoundYear = (int)$firstFoundYear[0];
} else {
    die('Could not find first snapshot');
}
$foundUrls = [];
$client = new Client(); // one Goutte client serves every request — no need to rebuild per year
foreach (range($firstFoundYear, (int)date('Y')) as $year) {
    echo ">> Year:" . $year . PHP_EOL;
    $crawler = $client->request('GET', "https://web.archive.org/web/$year*/$url"); // '*' => show calendar
    // BUGFIX: capture $foundUrls by reference. `use ($foundUrls)` copies the
    // array into the closure, so every append landed in the copy and the
    // outer $foundUrls stayed empty for the downstream download loop.
    $crawler->filter('.day a[href^="/web/"][href$="'.$url.'"]')->each(function ($node) use (&$foundUrls) {
        echo $foundUrls[] = $node->attr('href');
        echo PHP_EOL;
    });
}
After getting this far, just added this small part:
foreach ($foundUrls as $furl) {
preg_match('/\d+/', $furl, $timestamp);
$timestamp = $timestamp[0];
echo "wayback_machine_downloader $url -t $timestamp -d $timestamp" . PHP_EOL;
}
Which could be just executed like this: `php index.php | bash` — and I got all my stuff.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example out: