Created
March 21, 2012 03:02
-
-
Save rajbot/2143953 to your computer and use it in GitHub Desktop.
wayback-python prototyping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Prototype of liveweb proxy. | |
""" | |
def get_recent_crawl_location(url):
    """Look up redis for the location of the most recent crawl of the
    given URL, keyed by the URL's md5 digest.

    Returns the stored location record, or None on a cache miss.
    """
    key = md5sum(url)
    return redis.get(key)
def fetch(url):
    """Fetch a url from liveweb.

    If the url was crawled very recently, return that cached capture
    instead of fetching it again.
    """
    # BUG FIX: the original called get_recent_crawl_locaiton (typo),
    # which would raise NameError at runtime.
    location = get_recent_crawl_location(url)
    if location:
        # Location is stored as a whitespace-separated triple:
        # "<filename> <offset> <size>".
        filename, offset, size = location.split()
        content = read_file(filename, offset, size)
    else:
        # warcproxy is a script written by Kenji.
        # We need to modify it to update redis with offset info.
        content = warcproxy.fetch(url)
    return content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""query module prototype | |
""" | |
def fetch_url(url, timestamp):
    """Dispatch a wayback query based on the shape of the request."""
    if url.endswith('*'):
        # Prefix query: list captures matching the URL prefix.
        return fetch_prefix_matches(url, timestamp)
    if timestamp is None:
        # No timestamp: return the list of capture timestamps.
        return fetch_wayback_captures(url)
    # Exact URL and timestamp: return the archived content itself.
    return fetch_wayback_content(url, timestamp)
def fetch_prefix_matches(prefix_url, timestamp=None):
    """Return urldb matches for a prefix query, optionally narrowed
    to the given timestamp.
    """
    assert prefix_url.endswith('*')
    matches = query_urldb(prefix_url)
    # Only filter when there is both something to filter and a time to
    # filter by.
    if timestamp and matches:
        matches = filter_by_time(matches, timestamp)
    return matches
def fetch_wayback_content(url, timestamp):
    """Return the archived capture of *url* closest to *timestamp*.

    Returns None when the url has no captures.  HTML captures are run
    through the link rewriter before being returned.
    """
    # BUG FIX: the original def line was missing its trailing colon,
    # which is a SyntaxError.
    match_list = query_urldb(url)
    closest_match = get_closest_capture(match_list, timestamp)
    if closest_match is None:
        return None
    data = fetch_from_cluster(closest_match)
    if data.content_type == 'text/html':
        # Rewrite links so the page stays inside the wayback machine.
        data = rewrite_page(data, timestamp)
    return data
def fetch_wayback_captures(url):
    """Return the list of capture timestamps recorded for *url*."""
    return get_timestamps(query_urldb(url))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Robots.txt cache. | |
Recently used robots.txt files are cached in redis. | |
""" | |
from robotparser import RobotFileParser | |
import query_urldb as urldb | |
import liveweb | |
def get_robotstxt(host):
    """Return the robots.txt contents for *host*.

    Checks the redis cache first; on a miss, tries the wayback machine
    and then the live web, caching whatever was found.
    """
    cached = redis.get(host)
    if cached:
        return cached

    robots_url = "http://%s/robots.txt" % host
    timestamp = "latest-timestamp-here"
    txt = urldb.fetch_wayback_content(robots_url, timestamp) or liveweb.fetch(robots_url)
    redis.set(host, txt)
    return txt
def is_allowed(url):
    """Return True if robots.txt permits the wayback machine
    ("ia_archiver") to fetch/display the given URL.
    """
    robots_txt = get_robotstxt(get_host(url))
    rp = RobotFileParser()
    rp.parse(robots_txt.splitlines())
    return rp.can_fetch("ia_archiver", url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Prototype of wayback webapp. | |
""" | |
import robots | |
import query_urldb as urldb | |
import liveweb | |
def index():
    """Render the wayback machine home page."""
    return render_template("index.html")
def calendar(timestamp, url):
    """Render the calendar of archived captures for *url*.

    *timestamp* must be a wildcard pattern (contains '*').
    """
    assert "*" in timestamp
    # BUG FIX: the robots module defines is_allowed(), not allowed();
    # robots.allowed would raise AttributeError.
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)
    # NOTE(review): the query_urldb module exposes fetch_url(), not
    # query() — confirm the intended entry point here.
    data = urldb.query(url, timestamp)
    if data:
        return render_template("calendar.html", url, data)
    else:
        return render_template("not_archived.html", url)
def page(timestamp, url):
    """Render the archived capture of *url* at an exact *timestamp*.

    Falls back to the live web when the wayback machine has no capture.
    """
    assert "*" not in timestamp
    # BUG FIX: the robots module defines is_allowed(), not allowed();
    # robots.allowed would raise AttributeError.
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)
    data = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
    status, mimetype, body = parse_response(data)
    # Handle non 200 status
    if status != 200:
        return render_template("non200.html", status, url)
    # convert links and insert wayback header for html documents
    if mimetype == "text/html":
        body = rewrite_page(body, timestamp)
    return body
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment