Last active
August 15, 2018 13:34
-
-
Save CodeBrauer/79be0dc44cfd29f8a708583e0d84e259 to your computer and use it in GitHub Desktop.
Get all snapshots from archive.org as list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "require": { | |
| "fabpot/goutte": "^3.2", | |
| "digitalnature/php-ref": "^1.2" | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| require_once 'vendor/autoload.php'; | |
| use Goutte\Client; | |
// Site whose archive.org snapshots we want to enumerate.
$url = 'http://medoo.in/';
/**
 * Get the URL of the first archive.org snapshot of a website.
 *
 * Requests https://web.archive.org/web/<year>/<url>, which the Wayback
 * Machine answers with a redirect to the earliest snapshot at or after
 * that year; we read the Location header instead of following it.
 *
 * @param string  $url  url of the archived website
 * @param integer $year year of the first snapshot (default: 1996)
 * @return string|false snapshot url, or false on transport/parse failure
 * @author Leksat <http://stackoverflow.com/a/11699301/1990745>
 */
function getFirstSnapshot($url, $year = 1996) {
    $waybackurl = "https://web.archive.org/web/$year/$url"; // < redirects to first snapshot
    $ch = curl_init($waybackurl);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => true,  // headers are all we need
        CURLOPT_FOLLOWLOCATION => false, // the redirect target IS the answer
    ]);
    $response = curl_exec($ch);
    curl_close($ch);
    // BUGFIX: bail out on transport errors instead of regex-matching `false`.
    if ($response === false) {
        return false;
    }
    // Only the first Location header matters, so preg_match (not _all) suffices.
    return preg_match('/^Location:(.*)$/mi', $response, $m) === 1
        ? trim($m[1])
        : false;
}
$first_url = getFirstSnapshot($url);
if ($first_url) {
    // The first 4-digit run in the snapshot URL is its year, e.g.
    // https://web.archive.org/web/19981212030634/... -> 1998.
    // (The original pattern /\d{4}+/ used a redundant possessive quantifier,
    //  and indexed the match without checking preg_match succeeded.)
    if (preg_match('/\d{4}/', $first_url, $firstFoundYear) !== 1) {
        die('Could not parse year from first snapshot URL');
    }
    $firstFoundYear = (int)$firstFoundYear[0];
} else {
    die('Could not find first snapshot');
}
$foundUrls = [];
$client = new Client(); // one Goutte client serves every request — no need to rebuild per year
foreach (range($firstFoundYear, (int)date('Y')) as $year) {
    echo ">> Year:" . $year . PHP_EOL;
    $crawler = $client->request('GET', "https://web.archive.org/web/$year*/$url"); // '*' => show calendar
    // BUGFIX: capture $foundUrls by reference. `use ($foundUrls)` copies the
    // array into the closure, so every append landed in the copy and the
    // outer $foundUrls stayed empty for the downstream download loop.
    $crawler->filter('.day a[href^="/web/"][href$="'.$url.'"]')->each(function ($node) use (&$foundUrls) {
        echo $foundUrls[] = $node->attr('href');
        echo PHP_EOL;
    });
}
After getting this far, just added this small part:
foreach ($foundUrls as $furl) {
preg_match('/\d+/', $furl, $timestamp);
$timestamp = $timestamp[0];
echo "wayback_machine_downloader $url -t $timestamp -d $timestamp" . PHP_EOL;
}
Which could be just executed like this: `php index.php | bash` — and I got all my stuff.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example out: