charles-l · March 2, 2024 19:57 · charles-l · Nov 29, 2020
diff --git a/save_page.py b/save_page.py
 #!/usr/bin/env python3

 from bs4 import BeautifulSoup
 from readability import Document
 import click
 from click import echo
 import requests
 import slugify

 import os
 import os.path
 import urllib.parse
 import tempfile
 import zipfile
 import datetime

 @click.command()
 @click.argument('url')
 def save_page(url):
    """Archive the readable content of URL as '<slugified-title>.zip'.

    The zip holds the cleaned-up article HTML, every image that could be
    downloaded (renamed to '<index><extension>'), and a 'meta' file with the
    source URL and the UTC capture time. Links inside the article are
    rewritten to absolute URLs so they keep pointing at the original site.
    """
    # requests waits forever on a stalled server unless a timeout is given.
    request_timeout = 30

    with tempfile.TemporaryDirectory() as tempdir:
        response = requests.get(url, timeout=request_timeout)
        # Fail loudly (requests.HTTPError) instead of archiving an error page.
        response.raise_for_status()
        doc = Document(response.text)
        # A title made only of punctuation slugifies to '', which would
        # produce files named '.html' / '.zip'; fall back to a fixed name.
        nice_name = slugify.slugify(doc.title()) or 'page'
        doc_dom = BeautifulSoup(doc.summary(), features='lxml')

        # save images
        for i, img in enumerate(doc_dom.find_all('img')):
            src = img.get('src')
            if not src:
                # e.g. lazy-loaded images that only carry data-src
                echo('No src for img tag, leaving as is: ' + str(img), err=True)
                continue

            img_url = urllib.parse.urlparse(src)
            _, extension = os.path.splitext(img_url.path)
            if not extension:
                echo('No file extension for img src, leaving as is: ' + img_url.geturl(), err=True)
                continue

            # urljoin leaves absolute URLs alone and resolves both relative
            # and protocol-relative ('//host/x.png') ones against the page.
            img_resp = requests.get(urllib.parse.urljoin(url, src), timeout=request_timeout)
            # Unlike an assert, this check survives `python -O`.
            img_resp.raise_for_status()

            saved_path = f'{i}{extension}'
            with open(os.path.join(tempdir, saved_path), 'wb') as f:
                f.write(img_resp.content)

            # Point the tag at the copy stored next to the HTML in the zip.
            img['src'] = saved_path

        # update relative links to point at old content
        for a in doc_dom.find_all('a'):
            # `'href' in a` would search the tag's child nodes, not its
            # attributes, so attribute presence must be tested with has_attr.
            if not a.has_attr('href'):
                echo('Malformed a tag - no href - skipping: ' + str(a), err=True)
                continue
            a_url = urllib.parse.urlparse(a['href'])
            # Missing scheme or host means the link only made sense relative
            # to the original page (this includes '//host/path' links).
            if not (a_url.scheme and a_url.hostname):
                a['href'] = urllib.parse.urljoin(url, a['href'])

        # Explicit encoding: the platform default may be unable to encode
        # the page's characters.
        with open(os.path.join(tempdir, nice_name + '.html'), 'w', encoding='utf-8') as f:
            f.write(str(doc_dom))

        # Timezone-aware replacement for the deprecated utcnow(); tzinfo is
        # dropped so the text written to 'meta' keeps its original format.
        captured_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
            zipf.writestr(os.path.join(nice_name, 'meta'), '\n'.join([url, str(captured_at) + ' UTC']))
            for p in os.listdir(tempdir):
                zipf.write(os.path.join(tempdir, p), arcname=os.path.join(nice_name, p))

 # Invoke the click command only when this file is run as a script,
 # not when it is imported as a module.
 if __name__ == '__main__':
    save_page()
	#!/usr/bin/env python3

	from bs4 import BeautifulSoup
	from readability import Document
	import click
	from click import echo
	import requests
	import slugify

	import os
	import os.path
	import urllib.parse
	import tempfile
	import zipfile
	import datetime

	@click.command()
	@click.argument('url')
	def save_page(url):
	    """Archive the readable content of URL as '<slugified-title>.zip'.

	    The zip holds the cleaned-up article HTML, every image that could be
	    downloaded (renamed to '<index><extension>'), and a 'meta' file with the
	    source URL and the UTC capture time. Links inside the article are
	    rewritten to absolute URLs so they keep pointing at the original site.
	    """
	    # requests waits forever on a stalled server unless a timeout is given.
	    request_timeout = 30

	    with tempfile.TemporaryDirectory() as tempdir:
	        response = requests.get(url, timeout=request_timeout)
	        # Fail loudly (requests.HTTPError) instead of archiving an error page.
	        response.raise_for_status()
	        doc = Document(response.text)
	        # A title made only of punctuation slugifies to '', which would
	        # produce files named '.html' / '.zip'; fall back to a fixed name.
	        nice_name = slugify.slugify(doc.title()) or 'page'
	        doc_dom = BeautifulSoup(doc.summary(), features='lxml')

	        # save images
	        for i, img in enumerate(doc_dom.find_all('img')):
	            src = img.get('src')
	            if not src:
	                # e.g. lazy-loaded images that only carry data-src
	                echo('No src for img tag, leaving as is: ' + str(img), err=True)
	                continue

	            img_url = urllib.parse.urlparse(src)
	            _, extension = os.path.splitext(img_url.path)
	            if not extension:
	                echo('No file extension for img src, leaving as is: ' + img_url.geturl(), err=True)
	                continue

	            # urljoin leaves absolute URLs alone and resolves both relative
	            # and protocol-relative ('//host/x.png') ones against the page.
	            img_resp = requests.get(urllib.parse.urljoin(url, src), timeout=request_timeout)
	            # Unlike an assert, this check survives `python -O`.
	            img_resp.raise_for_status()

	            saved_path = f'{i}{extension}'
	            with open(os.path.join(tempdir, saved_path), 'wb') as f:
	                f.write(img_resp.content)

	            # Point the tag at the copy stored next to the HTML in the zip.
	            img['src'] = saved_path

	        # update relative links to point at old content
	        for a in doc_dom.find_all('a'):
	            # `'href' in a` would search the tag's child nodes, not its
	            # attributes, so attribute presence must be tested with has_attr.
	            if not a.has_attr('href'):
	                echo('Malformed a tag - no href - skipping: ' + str(a), err=True)
	                continue
	            a_url = urllib.parse.urlparse(a['href'])
	            # Missing scheme or host means the link only made sense relative
	            # to the original page (this includes '//host/path' links).
	            if not (a_url.scheme and a_url.hostname):
	                a['href'] = urllib.parse.urljoin(url, a['href'])

	        # Explicit encoding: the platform default may be unable to encode
	        # the page's characters.
	        with open(os.path.join(tempdir, nice_name + '.html'), 'w', encoding='utf-8') as f:
	            f.write(str(doc_dom))

	        # Timezone-aware replacement for the deprecated utcnow(); tzinfo is
	        # dropped so the text written to 'meta' keeps its original format.
	        captured_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

	        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
	            zipf.writestr(os.path.join(nice_name, 'meta'), '\n'.join([url, str(captured_at) + ' UTC']))
	            for p in os.listdir(tempdir):
	                zipf.write(os.path.join(tempdir, p), arcname=os.path.join(nice_name, p))

	# Invoke the click command only when this file is run as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    save_page()