Created
December 23, 2020 22:14
-
-
Save mikeytown2/cd8f571af70efe3ae9a5394024bc81dc to your computer and use it in GitHub Desktop.
Get the list of files captured in a Wayback Machine snapshot; choose the snapshot date from https://web.archive.org/web/*/https://file.wikileaks.org/file/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Report every error and include it in the output.
error_reporting(-1);
ini_set("display_errors", 1);
// Serve the result as plain text.
header('Content-Type: text/plain');
// Allow the script to run for up to 900 seconds (15 minutes).
set_time_limit(900);

// Wayback Machine snapshot prefix; the timestamp picks the capture to read.
$date = 'https://web.archive.org/web/20201221160810/';
// Archived directory listing from which the crawl starts.
$starting_point = "{$date}https://file.wikileaks.org/file/";
/**
 * Percent-encode the path of an absolute URL so it can be requested safely.
 *
 * Each path segment is decoded first and then encoded again, so a URL that
 * is already encoded is not double-encoded. An explicit port and the query
 * string are carried over unchanged; the fragment is discarded.
 *
 * @param string $url
 *   Absolute URL, e.g. "http://example.com/some dir/file name.txt".
 *
 * @return string
 *   The URL with an encoded path. Input without a scheme and host cannot be
 *   rebuilt and is returned unchanged.
 */
function escapefile_url($url) {
  $parts = parse_url($url);
  if ($parts === FALSE || !isset($parts['scheme'], $parts['host'])) {
    return $url;
  }

  // A bare "http://host" has no path component at all.
  $path = isset($parts['path']) ? $parts['path'] : '';
  $segments = array_map('rawurldecode', explode('/', $path));

  $escaped = $parts['scheme'] . '://' . $parts['host'];
  if (isset($parts['port'])) {
    // Keep a non-default port, otherwise the request goes to the wrong one.
    $escaped .= ':' . $parts['port'];
  }
  $escaped .= implode('/', array_map('rawurlencode', $segments));
  if (isset($parts['query'])) {
    $escaped .= '?' . $parts['query'];
  }
  return $escaped;
}
/**
 * Fetch an archived listing page and return the link targets found in it.
 *
 * Only the markup between the "END WAYBACK TOOLBAR INSERT" marker and the
 * first closing </html> tag is scanned, so links in the injected toolbar
 * are ignored. If the marker is absent nothing is scanned.
 *
 * @param string $url
 *   Address of the page; it is passed through escapefile_url() first.
 *
 * @return array
 *   href values in document order. The first link is always dropped
 *   (presumably the parent-directory entry -- TODO confirm), so the keys
 *   start at 1. Empty when the page cannot be fetched.
 */
function get_file_list($url) {
  // Throttle: wait 100ms before every request.
  usleep(100000);

  $context = stream_context_create(array(
    'http' => array(
      'follow_location' => TRUE,
    ),
  ));
  $raw = file_get_contents(escapefile_url($url), FALSE, $context);
  if ($raw === FALSE) {
    // Fetch failed; PHP has already raised a warning for it.
    return array();
  }

  // Keep what follows the toolbar marker and cut at the first </html>.
  $contents = '<html><body>' . stristr($raw, '<!-- END WAYBACK TOOLBAR INSERT -->');
  $contents = stristr($contents, '</html>', TRUE) . '</html>';

  preg_match_all('/<a[^>]+href=([\'"])(?<href>.+?)\1[^>]*>/i', $contents, $matches);
  $hrefs = $matches['href'];
  unset($hrefs[0]);
  return $hrefs;
}
/**
 * Recursively collect every entry reachable from a directory listing.
 *
 * Each link is recorded as $current_folder . $link. A link that ends in "/"
 * and is not an absolute http(s) URL is treated as a subfolder: its listing
 * is fetched with get_file_list() and walked the same way.
 *
 * @param array $urls
 *   Link targets taken from the listing of $current_folder.
 * @param string $current_folder
 *   URL of the folder the links belong to, with a trailing slash.
 *
 * @return array
 *   Flat, sequentially indexed list; each folder is followed directly by
 *   its own contents.
 */
function get_subfolders($urls, $current_folder) {
  $list = array();
  foreach ($urls as $url) {
    // rawurldecode(), not urldecode(): a "+" in a path is a literal plus.
    $url = rawurldecode($url);
    // NOTE(review): root-relative and absolute links are recorded as
    // $current_folder . $url too -- confirm that this is intended.
    $list[] = "$current_folder$url";

    // Anything that does not end in a slash is a file.
    if (substr($url, -1) !== '/') {
      continue;
    }
    // Absolute links are not followed.
    if (stripos($url, 'https://') === 0 || stripos($url, 'http://') === 0) {
      continue;
    }

    if (strpos($url, '/') === 0) {
      // Root-relative link: resolve against the host of the current folder.
      $parsed = parse_url($current_folder);
      $listing_url = "{$parsed['scheme']}://{$parsed['host']}{$url}";
    }
    else {
      $listing_url = "{$current_folder}{$url}";
    }

    // Append with array_merge(); the "+" union operator would drop every
    // child entry whose integer key already exists in $list.
    $list = array_merge(
      $list,
      get_subfolders(get_file_list($listing_url), $current_folder . $url)
    );
  }
  return $list;
}
/**
 * Strip the archive prefix from every entry of a list, in place.
 *
 * Every occurrence of $prefix inside an entry is removed, not only a
 * leading one. Array keys are preserved.
 *
 * @param string $prefix
 *   Text to remove, e.g. the Wayback Machine snapshot prefix.
 * @param array $list
 *   List of URLs; modified by reference.
 */
function remove_archive_prefix($prefix, &$list) {
  // str_replace() handles an array subject entry by entry, which avoids a
  // by-reference loop variable.
  $list = str_replace($prefix, '', $list);
}
// Read the top-level listing, then walk into every subfolder it links to.
$urls = get_file_list($starting_point);
$list = get_subfolders($urls, $starting_point);
// Remove the snapshot prefix from the collected entries and dump them.
remove_archive_prefix($date, $list);
print_r($list);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment