Skip to content

Instantly share code, notes, and snippets.

@relalis
Forked from charles-l/save_page.py
Created October 4, 2023 15:21
Show Gist options
  • Select an option

  • Save relalis/007e10a6346bf6e72d3feb7fff0dfc14 to your computer and use it in GitHub Desktop.

Select an option

Save relalis/007e10a6346bf6e72d3feb7fff0dfc14 to your computer and use it in GitHub Desktop.
A Python script that saves the Firefox Reader view of a page, including its images. It is a kind of personal archive.org tool, but it uses zip and HTML files rather than WARC.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from readability import Document
import click
from click import echo
import requests
import slugify
import os
import os.path
import urllib.parse
import tempfile
import zipfile
import datetime
# Seconds to wait on any single HTTP request before giving up, so a stalled
# server cannot hang the archive run forever.
_REQUEST_TIMEOUT = 30


def _fetch(target_url):
    """Download *target_url* and return the response; raise on HTTP errors."""
    resp = requests.get(target_url, timeout=_REQUEST_TIMEOUT)
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    resp.raise_for_status()
    return resp


def _localize_images(doc_dom, base_url, dest_dir):
    """Save every ``<img>`` of *doc_dom* into *dest_dir* and repoint its ``src``.

    Images whose URL path has no file extension, or that carry no ``src`` at
    all, are left untouched and reported on stderr.
    """
    for i, img in enumerate(doc_dom.find_all('img')):
        src = img.get('src')
        if not src:
            echo('Malformed img tag - no src - skipping: ' + str(img), err=True)
            continue
        img_url = urllib.parse.urlparse(src)
        _, extension = os.path.splitext(img_url.path)
        if not extension:
            echo('No file extension for img src, leaving as is: ' + img_url.geturl(), err=True)
            continue
        # urljoin leaves absolute URLs alone and also resolves relative and
        # protocol-relative ("//host/x.png") ones against the page URL.
        img_resp = _fetch(urllib.parse.urljoin(base_url, img_url.geturl()))
        saved_path = f'{i}{extension}'
        with open(os.path.join(dest_dir, saved_path), 'wb') as f:
            f.write(img_resp.content)
        img['src'] = saved_path


def _absolutize_links(doc_dom, base_url):
    """Rewrite non-absolute ``<a href>`` values so they point at the original site."""
    for a in doc_dom.find_all('a'):
        # NOTE: ``'href' in a`` would test the tag's *children*, not its
        # attributes, so the attribute has to be looked up with ``get``.
        href = a.get('href')
        if href is None:
            echo('Malformed a tag - no href - skipping: ' + str(a), err=True)
            continue
        a_url = urllib.parse.urlparse(href)
        if not (a_url.scheme and a_url.netloc):
            a['href'] = urllib.parse.urljoin(base_url, href)


@click.command()
@click.argument('url')
def save_page(url):
    """Archive the readable version of URL, with its images, as a zip file.

    The page is fetched, reduced to its main content, its images are
    downloaded next to the HTML, and relative links are made absolute.
    The result is written to "<slugified-title>.zip" in the current
    directory together with a "meta" entry holding the source URL and the
    UTC time of the capture. Any failed HTTP request aborts the run.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        response = _fetch(url)
        doc = Document(response.text)
        # An empty or unsluggable title would otherwise yield ".html"/".zip".
        nice_name = slugify.slugify(doc.title()) or 'page'
        doc_dom = BeautifulSoup(doc.summary(), features='lxml')

        _localize_images(doc_dom, url, tempdir)
        _absolutize_links(doc_dom, url)

        # Explicit encoding: the platform default may not cover the page text.
        html_path = os.path.join(tempdir, nice_name + '.html')
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(str(doc_dom))

        # Same naive-UTC rendering as the deprecated ``utcnow()``.
        captured_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
            zipf.writestr(os.path.join(nice_name, 'meta'),
                          '\n'.join([url, str(captured_at) + ' UTC']))
            for p in os.listdir(tempdir):
                zipf.write(os.path.join(tempdir, p), arcname=os.path.join(nice_name, p))
# Script entry point: hand control to the click command, which reads the URL
# argument from the command line. Nothing runs when the module is imported.
if __name__ == '__main__':
    save_page()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment