-
-
Save anandology/2143959 to your computer and use it in GitHub Desktop.
wayback-python prototyping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Prototype of liveweb proxy.
"""
def get_recent_crawl_location(url):
    """Look up where the most recent crawl of ``url`` is stored.

    The memcache key is ``md5sum(url)``; the value stored under that key is
    returned exactly as ``memcache_client.get`` provides it.
    """
    cache_key = md5sum(url)
    return memcache_client.get(cache_key)
def fetch(url):
    """Fetch a URL from liveweb.

    If the URL was crawled very recently, the stored copy is returned
    instead of fetching again: the location reported by
    ``get_recent_crawl_location`` is split on whitespace into
    ``filename``, ``offset`` and ``size`` and read back with ``read_file``.
    When no recent location is known, the URL is fetched through
    ``warcproxy``.

    Returns whatever ``read_file`` or ``warcproxy.fetch`` returns.
    Raises ValueError if a cached location does not split into exactly
    three whitespace-separated fields.
    """
    location = get_recent_crawl_location(url)
    if location:
        # NOTE(review): offset and size are passed on as the strings produced
        # by split(); confirm whether read_file expects integers.
        filename, offset, size = location.split()
        content = read_file(filename, offset, size)
    else:
        # warcproxy is a script written by Kenji.
        # We need to modify it to update memcache with offset info.
        content = warcproxy.fetch(url)
    return content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def fetch_url(url, timestamp):
    """Dispatch a lookup based on whether ``url`` is a prefix query.

    A URL ending in ``*`` is handed to ``fetch_prefix_matches``; any other
    URL is handed to ``fetch_wayback_content``. The chosen function's
    result is returned unchanged.
    """
    is_prefix_query = url.endswith('*')
    handler = fetch_prefix_matches if is_prefix_query else fetch_wayback_content
    return handler(url, timestamp)
def fetch_prefix_matches(prefix_url, timestamp=None):
    """Return the urldb matches for a prefix query.

    ``prefix_url`` must end with ``*`` (checked with an assert). When the
    query yields matches and a ``timestamp`` is given, the matches are
    narrowed with ``filter_by_time``; otherwise the raw query result is
    returned.
    """
    assert prefix_url.endswith('*')
    matches = query_urldb(prefix_url)
    if not (matches and timestamp):
        return matches
    return filter_by_time(matches, timestamp)
def fetch_wayback_content(url, timestamp):
    """Fetch the archived capture of ``url`` closest to ``timestamp``.

    The urldb is queried for ``url`` and ``get_closest_capture`` picks one
    entry from the result. Returns None when no capture is selected;
    otherwise returns the result of ``fetch_from_cluster`` for that capture.
    """
    candidates = query_urldb(url)
    capture = get_closest_capture(candidates, timestamp)
    return None if capture is None else fetch_from_cluster(capture)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Robots.txt cache.
Recently used robots.txt files are cached in memcache.
"""
from robotparser import RobotFileParser | |
import query_urldb as urldb | |
import liveweb | |
def get_robotstxt(host):
    """Return the robots.txt content for ``host``, using memcache as a cache.

    A truthy cached value (keyed by ``host``) is returned directly.
    Otherwise the file is looked up in the wayback machine, falling back to
    a liveweb fetch, and the result is stored in the cache before being
    returned.
    """
    # NOTE(review): `memcache` is not imported in the visible part of this
    # module; confirm where it comes from.
    cached = memcache.get(host)
    if cached:
        return cached

    robots_url = "http://%s/robots.txt" % host
    # Placeholder value carried over from the prototype.
    timestamp = "latest-timestamp-here"
    txt = urldb.fetch_wayback_content(robots_url, timestamp) or liveweb.fetch(robots_url)
    memcache.put(host, txt)
    return txt
def is_allowed(url):
    """Return True if robots.txt allows the wayback machine to fetch/display ``url``.

    The robots.txt text for the URL's host is obtained from
    ``get_robotstxt`` and evaluated for the ``ia_archiver`` user agent.
    When no robots.txt content is available, an empty rule set is parsed,
    which ``RobotFileParser`` treats as allowing the URL.
    """
    txt = get_robotstxt(get_host(url))
    parser = RobotFileParser()
    # get_robotstxt may return None or an empty value when neither the
    # wayback lookup nor the liveweb fetch yields content; fall back to ""
    # so a missing robots.txt does not raise AttributeError here.
    parser.parse((txt or "").splitlines())
    return parser.can_fetch("ia_archiver", url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Prototype of wayback webapp.
"""
import robots | |
import query_urldb as urldb | |
import liveweb | |
def index():
    """Render the ``index.html`` template and return the result."""
    rendered = render_template("index.html")
    return rendered
def calendar(timestamp, url):
    """Render the calendar view for a wildcard-timestamp request.

    ``timestamp`` must contain ``*`` (checked with an assert).

    Returns the rendered ``norobots.html`` template when robots.txt
    disallows ``url``, ``calendar.html`` when the urldb query returns data,
    and ``not_archived.html`` otherwise.
    """
    assert "*" in timestamp
    # The robots module defines `is_allowed`; there is no `allowed`.
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)

    # NOTE(review): no `query` function is visible in query_urldb; confirm
    # which urldb entry point this should call.
    data = urldb.query(url, timestamp)
    if data:
        return render_template("calendar.html", url, data)
    else:
        return render_template("not_archived.html", url)
def page(timestamp, url):
    """Serve a single archived page for an exact (non-wildcard) timestamp.

    ``timestamp`` must not contain ``*`` (checked with an assert).

    Returns the rendered ``norobots.html`` template when robots.txt
    disallows ``url``, and ``non200.html`` when the parsed response status
    is not 200. Otherwise returns the response body, which for
    ``text/html`` documents is first passed through ``rewrite_page``.
    """
    assert "*" not in timestamp
    # The robots module defines `is_allowed`; there is no `allowed`.
    if not robots.is_allowed(url):
        return render_template("norobots.html", url)

    # Prefer the archived copy; fall back to a liveweb fetch when the
    # wayback lookup returns nothing.
    data = urldb.fetch_wayback_content(url, timestamp) or liveweb.fetch(url)
    status, mimetype, body = parse_response(data)

    # Handle non 200 status
    if status != 200:
        return render_template("non200.html", status, url)

    # convert links and insert wayback header for html documents
    if mimetype == "text/html":
        body = rewrite_page(body, timestamp)
    return body
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment