Last active
August 24, 2023 20:05
-
-
Save cagrimmett/8d4bbbaecc841965818a4b12cb121da6 to your computer and use it in GitHub Desktop.
Script to check whether broken links have archived copies on the Wayback Machine
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Look up the closest archived snapshot of a URL through the Wayback Machine
 * availability API (https://archive.org/wayback/available).
 *
 * @param string $url The URL to look up. Surrounding whitespace is ignored.
 * @return string The snapshot URL reported by the API, or the literal string
 *                'Not available' when the URL is empty, the request fails, the
 *                response is not valid JSON, or no snapshot is reported.
 */
function checkWayback( $url ) {
    $url = trim( (string) $url );

    // Nothing to look up: avoid a pointless network round trip.
    if ( $url === '' ) {
        return 'Not available';
    }

    $apiUrl   = 'https://archive.org/wayback/available?url=' . urlencode( $url );
    $response = file_get_contents( $apiUrl );

    // file_get_contents() returns false when the request fails; do not try to decode it.
    if ( $response === false ) {
        return 'Not available';
    }

    $json = json_decode( $response, true );

    // json_decode() returns null for malformed JSON, and a scalar payload is not usable either.
    if ( ! is_array( $json ) ) {
        return 'Not available';
    }

    if ( isset( $json['archived_snapshots']['closest']['url'] )
        && is_string( $json['archived_snapshots']['closest']['url'] )
    ) {
        return $json['archived_snapshots']['closest']['url'];
    }

    return 'Not available';
}
// Input and output file paths.
// The input CSV is expected to hold the broken link in its 2nd column; the
// 1st column usually holds the page on which the broken link was found.
$inputFilePath  = 'broken_links.csv';
$outputFilePath = 'wayback.csv';

// Open the input CSV. fopen() returns false on failure, and fgetcsv() cannot
// operate on false, so stop here with a clear message instead.
$inputFile = fopen( $inputFilePath, 'r' );
if ( $inputFile === false ) {
    throw new RuntimeException( "Unable to open input file: $inputFilePath" );
}

// Open (and truncate) the output CSV, releasing the input handle if that fails.
$outputFile = fopen( $outputFilePath, 'w' );
if ( $outputFile === false ) {
    fclose( $inputFile );
    throw new RuntimeException( "Unable to open output file: $outputFilePath" );
}

// Process the input CSV file row by row.
while ( ( $row = fgetcsv( $inputFile ) ) !== false ) {
    // fgetcsv() reports a blank line as array( null ); it is not a record, so skip it.
    if ( $row === array( null ) ) {
        continue;
    }

    $originalUrl = isset( $row[1] ) ? trim( $row[1] ) : '';

    // A row without a URL cannot have a snapshot; skip the API call for it.
    $waybackUrl = $originalUrl === '' ? 'Not available' : checkWayback( $originalUrl );

    // Write the original row plus the lookup result as an extra trailing column.
    $outputRow = array_merge( $row, array( $waybackUrl ) );
    echo "$originalUrl ---- $waybackUrl\n";

    // fputcsv() returns false on failure; do not silently produce a truncated report.
    if ( fputcsv( $outputFile, $outputRow ) === false ) {
        fclose( $inputFile );
        fclose( $outputFile );
        throw new RuntimeException( "Unable to write to output file: $outputFilePath" );
    }
}

// Close input and output files.
fclose( $inputFile );
fclose( $outputFile );
echo 'CSV processing completed.';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment