Last active
January 16, 2021 22:55
-
-
Save wcaleb/dcd769a64fa2773f2c3b to your computer and use it in GitHub Desktop.
Pandoc filter to replace Link URLs with Wayback Machine URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Usage: pandoc --filter=wayback.py input
# Install pandocfilters and requests with pip before using
# Warning: may take a while to process input with lots of links
# Note: Links that can't be saved to WBM or already point to WBM are left as is
from pandocfilters import toJSONFilter, Link
import requests
base_url = 'http://web.archive.org'


def wayback(k, v, f, m):
    """Swap a Link's URL for a freshly saved Wayback Machine snapshot URL.

    Written as a pandocfilters action: it is called once per element of the
    document tree and only acts on ``Link`` elements.

    Args:
        k: Element type name (e.g. ``'Link'``).
        v: Element payload.  For a Link this is either the two-field form
            ``[inlines, [url, title]]`` or the three-field form
            ``[attr, inlines, [url, title]]``; the target is the last field
            in both.
        f: Output format (unused).
        m: Document metadata (unused).

    Returns:
        A new ``Link`` element pointing at the snapshot when the save request
        succeeds; otherwise ``None``, which pandocfilters treats as "leave
        this element unchanged".  ``None`` is returned for non-Link elements,
        for links that already point at the Wayback Machine, and for links
        that could not be saved.
    """
    if k != 'Link':
        return None

    # The target pair is always the last field, whichever payload layout the
    # running pandoc version emits.
    target = v[-1]
    url, title = target[0], target[1]

    # Compare without the scheme so that https:// (and protocol-relative)
    # links to the archive are recognised as already archived too.
    archive_host = base_url.split(':', 1)[1]
    if archive_host in url:
        return None

    try:
        # A timeout keeps one unresponsive save request from hanging the
        # whole pandoc run.
        r = requests.get(base_url + '/save/' + url, timeout=120)
    except requests.exceptions.RequestException:
        # Links that cannot be saved are left as they are.
        return None

    # Without a 200 response carrying a snapshot path there is nothing to
    # link to, so keep the original URL.
    location = r.headers.get('content-location')
    if r.status_code != requests.codes.ok or not location:
        return None

    # Rebuild the Link with every leading field untouched and only the URL
    # replaced.  List concatenation keeps this valid on Python 2 as well.
    return Link(*(list(v[:-1]) + [[base_url + location, title]]))
if __name__ == "__main__":
    # Entry point when pandoc runs this file via --filter: hand the
    # per-element action to pandocfilters.
    toJSONFilter(wayback)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Seems like this should maintain a persistent cache to prevent updating wayback on every rebuild.