-
-
Save raivivek/0eb22573d6ba4b0b6873889511118621 to your computer and use it in GitHub Desktop.
Pandoc filter to replace Link URLs with Wayback Machine URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Usage: pandoc --filter=wayback.py input
# Install pandocfilters and requests with pip before using
# Warning: may take a while to process input with lots of links
# Note: Links that can't be saved to WBM or already point to WBM are left as is
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

import requests
from pandocfilters import toJSONFilter, Link
base_url = 'http://web.archive.org'


def wayback(k, v, f, m):
    """Replace a Link's URL with a freshly saved Wayback Machine snapshot URL.

    This is a pandocfilters action: it is called once per AST element.

    Parameters:
        k: element type name (e.g. 'Link', 'Str').
        v: element contents. For a Link the last item is the
           ``[url, title]`` target; any preceding items (inlines, and the
           attribute triple in the three-element layout) are passed through
           untouched, so both the two- and three-element Link layouts work.
        f: output format (unused).
        m: document metadata (unused).

    Returns:
        A new Link element pointing at the snapshot, or None to leave the
        element unchanged. None is returned for non-Link elements, for
        targets that are not http(s) URLs, for links that already point at
        the Wayback Machine host, and whenever the save request fails.
    """
    if k != 'Link':
        return None

    # The target is the last entry in every Link layout.
    target = v[-1]
    url, title = target[0], target[1]

    try:
        parts = urlparse(url)
        wayback_host = urlparse(base_url).hostname
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): nothing to archive.
        return None

    # Fragments, relative paths, mailto: and similar targets cannot be
    # archived, so do not spend a request on them.
    if parts.scheme not in ('http', 'https'):
        return None

    # Compare hosts rather than substrings so https:// snapshot links are
    # recognised and unrelated URLs that merely mention the host are not.
    if parts.hostname == wayback_host:
        return None

    try:
        # Without a timeout one stalled connection would hang the filter.
        r = requests.get(base_url + '/save/' + url, timeout=60)
    except requests.RequestException:
        # Best effort: a link that cannot be saved is left as is.
        return None

    # The snapshot path is read from the Content-Location header; a 200
    # response without that header gives us nothing to link to.
    location = r.headers.get('content-location')
    if r.status_code != requests.codes.ok or not location:
        return None

    contents = list(v[:-1]) + [[base_url + location, title]]
    return Link(*contents)
if __name__ == "__main__":
    # Run as a pandoc JSON filter, applying `wayback` to each AST element.
    toJSONFilter(wayback)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment