Created
April 30, 2009 23:48
-
-
Save stilist/104765 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby -w
# This script checks if a given GeoCities page has been saved by the Archive
# Team. Its goal is to reduce duplication of effort by only grabbing new pages.
#
# Input: plaintext list of URLs to check
# Output: plaintext list of URLs not in the cache
#
# MIT license, stilist, 02009
require 'cgi'
require 'socket'
require 'timeout'
# Command-line validation: both an input path and an output path are required.
# Diagnostics go to stderr so they never mix with the per-URL report that the
# rest of the script prints on stdout.
unless ARGV[0] && ARGV[1]
  warn 'Usage: listvscache inputfile outputfile'
  exit(-1)
end

# The input path must exist and be a regular file (File.ftype reports 'file').
unless File.exist?(ARGV[0]) && File.ftype(ARGV[0]) == 'file'
  warn 'Please specify a valid input file'
  exit(-1)
end

# Shared state read by dupeCheck and listVsCache.
$inputFile = File.open(ARGV[0], 'r')   # list of URLs to check, one per line
$outputFile = File.new(ARGV[1], 'w')   # receives the URLs reported as not cached
$baseURL = 'www.geneb.org'             # host of the dupe-check service
$extURL = '/cgi-bin/at_dupecheck.cgi?sitemode=geocities&url='
# Asks the dupe-check service whether +address+ is already cached and prints
# the verdict on stdout.
#
# Every 'http://' occurrence is removed from a copy of +address+ (the caller's
# string is left untouched); the result is appended to $extURL and requested
# from $baseURL on port 80. Only the last character of the raw response is
# inspected:
#   'N' - not cached; the address is also written to $outputFile
#   'Y' - cached
#   'X' - reported as invalid
#   anything else, including an empty response, is reported as a bad URL
# NOTE(review): this assumes the service ends its reply with the verdict
# character (no trailing newline or chunked framing) - confirm against the CGI.
#
# Each attempt is limited to one second. After a timeout the request is
# retried until five attempts have been made in total.
#
# Returns true when every attempt timed out, nil otherwise. Errors other than
# Timeout::Error (e.g. from TCPSocket.open) are not rescued here.
def dupeCheck(address)
  target = address.gsub('http://', '')
  retries = 5

  begin
    verdict = Timeout.timeout(1) do
      socket = TCPSocket.open($baseURL, 80)
      begin
        # "Connection: close" makes the server end the stream after replying;
        # without it an HTTP/1.1 keep-alive connection keeps `read` (which
        # reads until EOF) blocked until the timeout fires.
        socket.write("GET #{$extURL}./#{target} HTTP/1.1\r\n" \
                     "Host: #{$baseURL}\r\nConnection: close\r\n\r\n")
        socket.read.to_s[-1, 1] # only need a single character; nil if empty
      ensure
        # Runs on success, on error and when the timeout interrupts the read,
        # so the descriptor is never leaked.
        socket.close
      end
    end

    case verdict
    when 'N'
      puts "#{target} is not cached"
      $outputFile.puts target
    when 'Y'
      puts "#{target} is cached"
    when 'X'
      puts "#{target} is invalid"
    else
      puts "Bad URL: #{target}"
    end
  rescue Timeout::Error
    retries -= 1
    if retries > 0
      puts "Will retry #{target} (#{retries} attempts remain)"
      sleep 0.001
      retry
    else
      puts "ERROR: Giving up on #{target}"
      return true
    end
  end
end
# Runs dupeCheck for every line of $inputFile.
#
# Each line is CGI-unescaped, then all whitespace and every "../" sequence is
# removed before the lookup. Lines that are empty after this clean-up are
# skipped instead of being sent to the service.
#
# $inputFile and $outputFile are closed when the method finishes, including
# when dupeCheck raises, so the results written so far are flushed.
#
# Returns nil.
def listVsCache
  $inputFile.each_line do |line|
    # clean things up a little
    address = CGI.unescape(line).gsub(/(\s+|\.\.\/)/, '') # space or ../
    next if address.empty?

    dupeCheck(address)
    # sleep 0.001 # rate-limit to 1000/sec
  end
  nil
ensure
  $inputFile.close unless $inputFile.closed?
  $outputFile.close unless $outputFile.closed?
end
# Entry point: process the whole input list.
listVsCache
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment