wayback-python prototyping
"""Prototype of liveweb proxy.
"""
def get_recent_crawl_location(url):
"""Looks at memcache to find the location of the recent crawl of the given URL.
"""
return memcache_client.get(md5sum(url))
def fetch(url):
"""Fetches a url from liveweb.
If the url is cralwed very recently, returns that instead of fetching again.
"""
location = get_recent_crawl_locaiton(url)
if location:
filename, offset, size = location.split()
content = read_file(filename, offset, size)
else:
# warcproxy is a script written by Kenji
# We need to modify it to update memcache with offset info.
content = warcproxy.fetch(url)
return content
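
# md5sum and read_file are assumed above but not defined in this prototype.
# A minimal sketch, assuming the location stored in memcache is a
# "filename offset size" string pointing into a crawl (WARC) file on disk.

import hashlib

def md5sum(url):
    # Hex digest of the URL, used as the memcache key for crawl locations.
    return hashlib.md5(url).hexdigest()

def read_file(filename, offset, size):
    # Read `size` bytes starting at byte `offset` of the given file.
    with open(filename, "rb") as f:
        f.seek(int(offset))
        return f.read(int(size))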

def fetch_url(url, timestamp):
    """Fetches the given URL at the given timestamp, dispatching prefix queries.
    """
    if url.endswith('*'):
        return fetch_prefix_matches(url, timestamp)
    else:
        return fetch_wayback_content(url, timestamp)

def fetch_prefix_matches(prefix_url, timestamp=None):
    """Returns the captures whose URLs match the given prefix, optionally
    filtered by timestamp.
    """
    assert prefix_url.endswith('*')
    match_list = query_urldb(prefix_url)
    if match_list and timestamp:
        match_list = filter_by_time(match_list, timestamp)
    return match_list

def fetch_wayback_content(url, timestamp):
    """Fetches the capture of the given URL closest to the given timestamp
    from the cluster.
    """
    match_list = query_urldb(url)
    closest_match = get_closest_capture(match_list, timestamp)
    if closest_match is None:
        return None
    return fetch_from_cluster(closest_match)
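
# get_closest_capture is assumed above but not defined here. A minimal sketch,
# assuming each entry in match_list carries a 14-digit wayback timestamp
# (YYYYMMDDhhmmss) in a `timestamp` attribute: pick the capture numerically
# closest to the requested timestamp.

def get_closest_capture(match_list, timestamp):
    if not match_list:
        return None
    return min(match_list, key=lambda m: abs(int(m.timestamp) - int(timestamp)))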
"""Robots.txt cache.
Recently used robots.txt files are cached in memcache.
"""
from robotparser import RobotFileParser
import query_urldb as urldb
import liveweb
def get_robotstxt(host):
"""Returns the contents of robots.txt file for given host from the wayback
machine.
"""
txt = memcache.get(host)
if not txt:
url = "http://%s/robots.txt" % host
timestamp = "latest-timestamp-here"
txt = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
memcache.put(host, txt)
return txt
def is_allowed(url):
"""Returns True if robots.txt allows waybackmachine to fetch/display the given URL.
"""
txt = get_robotstxt(get_host(url))
parser = RobotFileParser()
parser.parse(txt.splitlines())
return parser.can_fetch("ia_archiver", url)
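
# get_host is assumed above but not defined. A minimal sketch using the
# standard library: the hostname part of the URL, so all URLs on the same
# host share one cached robots.txt entry.

from urlparse import urlparse

def get_host(url):
    return urlparse(url).netloc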
"""Prototype of wayback webapp.
"""
import robots
import query_urldb as urldb
import liveweb
def index():
return render_template("index.html")
def calendar(timestamp, url):
assert "*" in timestamp
if not robots.allowed(url):
return render_template("norobots.html", url)
data = urldb.query(url, timestamp)
if data:
return render_template("calendar.html", url, data)
else:
return render_template("not_archived.html", url)
def page(timestamp, url):
assert "*" not in timestamp
if not robots.allowed(url):
return render_template("norobots.html", url)
data = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
status, mimetype, body = parse_response(data)
# Handle non 200 status
if status != 200:
return render_template("non200.html", status, url)
# convert links and insert wayback header for html documents
if mimetype == "text/html":
body = rewrite_page(body, timestamp)
return body
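
# URL routing is not part of this prototype; render_template, parse_response
# and rewrite_page are also assumed rather than defined. A hypothetical
# dispatcher, assuming wayback-style paths of the form /web/<timestamp>/<url>:
# a timestamp containing "*" renders the calendar, otherwise the archived page.

def dispatch(path):
    if path == "/":
        return index()
    _, _, rest = path.partition("/web/")
    timestamp, _, url = rest.partition("/")
    if "*" in timestamp:
        return calendar(timestamp, url)
    else:
        return page(timestamp, url)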