Export Instapaper to HTML & PDF

Use this to automatically scrape all of your saved Instapaper articles locally as HTML and PDF files. I originally wrote this to read my saved documents on my reMarkable tablet. As far as I could tell, Instapaper does not have an option to export everything as PDF (the built-in options only export a subset).

You will need to have the following packages installed (the Python ones can be installed with pip install requests beautifulsoup4 pdfkit):

  • requests
  • beautifulsoup4
  • pdfkit (a wrapper around the wkhtmltopdf command-line tool, which must also be installed)

Configure your username and password, then run the script. It will go through all articles shown on your home page, download the copy Instapaper has stored into a folder called output as an HTML file, and convert each one into a PDF. You can customize the look by updating the included styles.css file. Any errors will be reported and logged to failed.txt (one tab-separated line per article ID). Errors might be due to parsing problems on Instapaper's side or due to PDF conversion issues. I am not sure how to fix the first, but the script will retry a couple of times in the second case.

On my computer this took about 1-10 seconds per article and up to a minute when PDFs needed to be remade. You might need to quit and rerun the script if it takes longer than that. It should resume from where it left off, since it detects when a PDF has already been created. This was written in an afternoon, so apologies for any issues.
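If you want to sanity-check your credentials before a full run, a minimal sketch like the following might help. It uses the same login endpoint and article-list page the script below relies on; the "article_list" check is an assumption about the page markup, taken from the script's own parsing:

import requests

s = requests.Session()
s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
})
# The script below finds articles inside an element with id="article_list",
# so its presence on the first list page is a reasonable signal that the
# login worked.
home = s.get("https://www.instapaper.com/u/1")
print("logged in" if "article_list" in home.text else "login failed")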

#!/usr/bin/env python
# coding: utf-8
import os
import time

import requests
from bs4 import BeautifulSoup
import pdfkit

# Log in once and reuse the session cookie for all later requests.
s = requests.Session()
s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
})

base = "./output/"
os.makedirs(base, exist_ok=True)  # create the output folder on a fresh run


def get_ids(page=1):
    # Scrape one page of the article list and collect the article IDs.
    r = s.get("https://www.instapaper.com/u/" + str(page))
    soup = BeautifulSoup(r.text, "html.parser")
    articles = soup.find(id="article_list").find_all("article")
    ids = [i["id"].replace("article_", "") for i in articles]
    # An "older" pagination link means there is at least one more page.
    has_more = soup.find(class_="paginate_older") is not None
    return ids, has_more


def get_article(id):
    # Fetch Instapaper's stored copy of a single article.
    r = s.get("https://www.instapaper.com/read/" + str(id))
    soup = BeautifulSoup(r.text, "html.parser")
    title = soup.find(id="titlebar").find("h1").getText()
    origin = soup.find(id="titlebar").find(class_="origin_line").getText()
    content = soup.find(id="story").decode_contents()
    return {
        "title": title.strip(),
        "origin": origin.strip(),
        "content": content.strip()
    }


def article_converted(id):
    # Return the path of an already-converted PDF for this ID, if any.
    for file_name in os.listdir(base):
        if file_name.startswith(id) and file_name.endswith(".pdf"):
            return base + os.path.basename(file_name)
    return None


def download_article(id):
    # Save the article as "<id> <title>.html", keeping only alphanumeric
    # characters and spaces in the file name.
    article = get_article(id)
    file_name = id + " " + article["title"]
    file_name = "".join([c for c in file_name if c.isalpha()
                         or c.isdigit() or c == " "]).rstrip()
    file_name = base + file_name + ".html"
    with open(file_name, "w") as file:
        file.write("<h1>%s</h1>" % (article["title"]))
        file.write("<div id='origin'>%s · %s</div>" % (article["origin"], id))
        file.write(article["content"])
    return file_name


def convert_to_pdf(file_name):
    # Convert the saved HTML file to a PDF with the same base name.
    new_name = file_name[:-5] + ".pdf"
    margin = "0.75in"
    options = {
        "page-size": "Letter",
        "margin-top": margin,
        "margin-right": margin,
        "margin-bottom": margin,
        "margin-left": margin,
        "encoding": "UTF-8",
        "no-outline": None,
        "user-style-sheet": "styles.css",
        "load-error-handling": "ignore",
        "quiet": "",
    }
    pdfkit.from_file(file_name, new_name, options=options)
    return new_name


has_more = True
page = 1
failure_log = open("failed.txt", "a+")
while has_more:
    print("Page " + str(page))
    ids, has_more = get_ids(page)
    for id in ids:
        print(" " + id + ": ", end="")
        existing_file = article_converted(id)
        if existing_file:
            # Resume support: skip articles that already have a PDF.
            print("exists")
        else:
            start = time.time()
            try:
                file_name = download_article(id)
            except Exception as e:
                print("failed!")
                print(e)
                failure_log.write("%s\t%s\n" % (id, str(e)))
                failure_log.flush()
                continue
            # PDF conversion occasionally fails transiently, so retry a few
            # times before logging the article as failed.
            retries = 10
            while True:
                try:
                    convert_to_pdf(file_name)
                except Exception as e:
                    retries -= 1
                    if retries < 0:
                        print("failed!")
                        print(e)
                        failure_log.write("%s\t%s\n" % (id, str(e)))
                        failure_log.flush()
                        break
                    continue
                break
            duration = time.time() - start
            print(str(round(duration, 2)) + " seconds")
            if duration < 1:  # throttle to roughly one article per second
                time.sleep(1 - duration)
    page += 1
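The PDF layout is controlled entirely by the options dict passed to pdfkit, whose keys map to wkhtmltopdf flags. If you prefer A4 paper or narrower margins, for example, only that dict needs to change. A sketch, where the file paths are placeholders and A4/0.5in are example values rather than what the script above uses:

import pdfkit

options = {
    "page-size": "A4",      # example value; the script uses Letter
    "margin-top": "0.5in",  # example value; the script uses 0.75in
    "margin-right": "0.5in",
    "margin-bottom": "0.5in",
    "margin-left": "0.5in",
    "encoding": "UTF-8",
    "user-style-sheet": "styles.css",
}
# "output/example.html" is a placeholder for one of the saved HTML files.
pdfkit.from_file("output/example.html", "output/example.pdf", options=options)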
styles.css:

html,
body {
  margin: 0;
  padding: 0;
}

html {
  font-family: Georgia, "Times New Roman", Times, serif;
  font-size: 20px;
  line-height: 1.5;
  word-wrap: break-word;
  -webkit-hyphens: none;
  -ms-hyphens: none;
  hyphens: none;
  padding: 1em;
  background: white;
  color: black;
}

#origin {
  opacity: 0.6;
  margin-bottom: 2em;
}

a {
  text-decoration: none;
  border-bottom: 2px solid;
  color: inherit;
}

a:after {
  content: " (" attr(href) ")";
  border-bottom: 2px solid white;
  color: rgba(0, 0, 0, 0.6);
}

img {
  max-width: 100%;
}

pre,
code {
  font-size: 16px;
  line-height: 1.1em;
  font-family: "Monaco", monospace;
}

li p:first-child {
  margin: 0;
}
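Note that the a:after rule is what prints each link's URL in parentheses after the link text, since links are not clickable on paper or e-ink. If you do not want URLs spelled out in the PDFs, remove that rule from styles.css.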