Last active
          August 15, 2018 13:34 
        
      - 
      
- 
        Save CodeBrauer/79be0dc44cfd29f8a708583e0d84e259 to your computer and use it in GitHub Desktop. 
    Get all snapshots from archive.org as list
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
{
    "require": {
        "fabpot/goutte": "^3.2",
        "digitalnature/php-ref": "^1.2"
    }
}
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <?php | |
| require_once 'vendor/autoload.php'; | |
| use Goutte\Client; | |
| $url = "http://medoo.in/"; | |
/**
 * Get the first snapshot URL of a site from archive.org.
 *
 * Requests https://web.archive.org/web/<year>/<url>, which the Wayback
 * Machine answers with a redirect to the earliest snapshot at or after
 * that year; we read the Location header instead of following it.
 *
 * @param string  $url  url of the archived website
 * @param integer $year year of the first snapshot (default: 1996)
 * @return string|false first snapshot url, or false if none was found
 * @author Leksat <http://stackoverflow.com/a/11699301/1990745>
 */
function getFirstSnapshot($url, $year = 1996) {
    $waybackurl = "https://web.archive.org/web/$year/$url"; // < redirects to first snapshot
    $ch = curl_init($waybackurl);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => true,
        CURLOPT_FOLLOWLOCATION => false, // we want the Location header itself, not the target
    ]);
    $response = curl_exec($ch);
    curl_close($ch);

    // Bug fix: curl_exec() returns false on transport failure; the original
    // passed that straight into preg_match_all(), silently turning network
    // errors into "no snapshot found" (and raising a warning on PHP 8+).
    if ($response === false) {
        return false;
    }

    preg_match_all('/^Location:(.*)$/mi', $response, $matches);
    return !empty($matches[1]) ? trim($matches[1][0]) : false;
}
// Locate the earliest snapshot; bail out early if archive.org has none.
$first_url = getFirstSnapshot($url);
if (!$first_url) {
    die('Could not find first snapshot');
}
preg_match('/\d{4}+/', $first_url, $firstFoundYear); // find year
$firstFoundYear = (int)$firstFoundYear[0];
// Walk every year from the first snapshot up to today and collect all
// snapshot links from archive.org's calendar view for that year.
$foundUrls = [];
foreach (range($firstFoundYear, (int)date('Y')) as $year) {
    echo ">> Year:" . $year . PHP_EOL;
    $client = new Client();
    $crawler = $client->request('GET', "https://web.archive.org/web/$year*/$url"); // '*' => show calendar
    // Bug fix: the original captured $foundUrls by value (use ($foundUrls)),
    // so appends inside the closure never reached the outer array and
    // $foundUrls stayed empty after the loop. Capture by reference instead.
    // (The unused $maxpage assignment was dropped for the same reason.)
    $crawler->filter('.day a[href^="/web/"][href$="'.$url.'"]')->each(function ($node) use (&$foundUrls) {
        echo $foundUrls[] = $node->attr('href');
        echo PHP_EOL;
    });
}
After getting this far, I just added this small part:
foreach ($foundUrls as $furl) {
    preg_match('/\d+/', $furl, $timestamp);
    $timestamp = $timestamp[0];
    echo "wayback_machine_downloader $url -t $timestamp -d $timestamp" . PHP_EOL;
}

This could then be executed like `php index.php | bash`, and I got all my stuff.
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Example out: