|
#!/usr/bin/env python |
|
# coding: utf-8 |
|
|
|
import requests |
|
import time |
|
from bs4 import BeautifulSoup |
|
import pdfkit |
|
import os |
|
|
|
# Authenticated Instapaper session shared by all request helpers below.
s = requests.Session()

# Log in once; Instapaper keeps the auth cookie on `s` for later requests.
# NOTE(review): the response is not checked, so a bad username/password only
# surfaces later as parse failures inside get_ids/get_article — TODO confirm.
s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
})

# Directory where downloaded HTML files and converted PDFs are written.
base = "./output/"
# Create it up front: download_article() opens files under `base` and would
# otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs(base, exist_ok=True)
|
|
|
|
|
def get_ids(page=1):
    """Return (ids, has_more) for one page of the saved-article listing.

    ids is a list of article id strings scraped from "/u/<page>";
    has_more is True when an older (next) page exists.
    """
    response = s.get("https://www.instapaper.com/u/" + str(page))
    listing = BeautifulSoup(response.text, "html.parser")

    ids = []
    for node in listing.find(id="article_list").find_all("article"):
        # Each <article> carries id="article_<numeric id>"; keep the number.
        ids.append(node["id"].replace("article_", ""))

    # The "paginate_older" link is only rendered when another page follows.
    return ids, listing.find(class_="paginate_older") is not None
|
|
|
|
|
def get_article(id):
    """Fetch one article's reader view and return its pieces.

    Returns a dict with "title", "origin" and "content" (the inner HTML
    of the story body), each stripped of surrounding whitespace.
    """
    response = s.get("https://www.instapaper.com/read/" + str(id))
    page = BeautifulSoup(response.text, "html.parser")

    titlebar = page.find(id="titlebar")
    parts = {
        "title": titlebar.find("h1").getText(),
        "origin": titlebar.find(class_="origin_line").getText(),
        "content": page.find(id="story").decode_contents(),
    }
    return {key: value.strip() for key, value in parts.items()}
|
|
|
|
|
def article_converted(id):
    """Return the path of an already-converted PDF for `id`, or None."""
    matches = (
        name for name in os.listdir(base)
        if name.startswith(id) and name.endswith(".pdf")
    )
    found = next(matches, None)
    if found is None:
        return None
    # listdir already yields bare names; basename kept for parity with the
    # original behavior.
    return base + os.path.basename(found)
|
|
|
|
|
def download_article(id):
    """Fetch article `id` and save it as an HTML file under `base`.

    The file name is "<id> <title>" reduced to letters, digits and
    spaces so it is safe on every filesystem. Returns the path written.
    """
    article = get_article(id)

    # Sanitize "<id> <title>" down to [letters, digits, spaces] only.
    # Deliberately isalpha()/isdigit(), not isalnum(): isalnum() also
    # accepts other numeric characters (e.g. vulgar fractions).
    raw_name = id + " " + article["title"]
    safe_name = "".join(
        c for c in raw_name if c.isalpha() or c.isdigit() or c == " "
    ).rstrip()
    file_name = base + safe_name + ".html"

    # encoding="utf-8" explicitly: article text is arbitrary Unicode and
    # the platform default (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError on write.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("<h1>%s</h1>" % (article["title"]))
        file.write("<div id='origin'>%s · %s</div>" % (article["origin"], id))
        file.write(article["content"])

    return file_name
|
|
|
|
|
def convert_to_pdf(file_name):
    """Convert a saved HTML file to PDF via pdfkit and return the PDF path.

    The PDF is written next to the input with the extension swapped, e.g.
    "output/123 Title.html" -> "output/123 Title.pdf".
    """
    # os.path.splitext instead of file_name[:-5]: slicing a fixed count
    # assumed a ".html" suffix and silently mangled any other name.
    new_name = os.path.splitext(file_name)[0] + ".pdf"
    margin = "0.75in"
    options = {
        "page-size": "Letter",
        "margin-top": margin,
        "margin-right": margin,
        "margin-bottom": margin,
        "margin-left": margin,
        "encoding": "UTF-8",
        "no-outline": None,                # flat PDF, no bookmark outline
        "user-style-sheet": "styles.css",  # local reader-style stylesheet
        "load-error-handling": "ignore",   # don't abort on broken assets
        "quiet": "",
    }

    pdfkit.from_file(file_name, new_name, options=options)
    return new_name
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main loop: walk every listing page, downloading and converting each
# article that does not already have a PDF in the output directory.
# ---------------------------------------------------------------------------
has_more = True
page = 1

# Context manager (instead of a bare open() that was never closed)
# guarantees the failure log is flushed and closed even if the loop dies.
with open("failed.txt", "a+") as failure_log:
    while has_more:
        print("Page " + str(page))
        ids, has_more = get_ids(page)
        for id in ids:
            print(" " + id + ": ", end="")
            existing_file = article_converted(id)
            if existing_file:
                print("exists")
                continue

            start = time.time()
            try:
                file_name = download_article(id)
            except Exception as e:
                print("failed!")
                print(e)
                failure_log.write("%s\t%s\n" % (id, str(e)))
                failure_log.flush()
                continue

            # wkhtmltopdf can fail transiently (e.g. remote assets); retry
            # up to 10 more times (11 attempts total) before logging the
            # article as failed.
            retries = 10
            while True:
                try:
                    convert_to_pdf(file_name)
                except Exception as e:
                    retries -= 1
                    if retries < 0:
                        print("failed!")
                        print(e)
                        failure_log.write("%s\t%s\n" % (id, str(e)))
                        failure_log.flush()
                        break
                    continue
                break

            duration = time.time() - start
            print(str(round(duration, 2)) + " seconds")
            if duration < 1:  # be polite: at most ~1 request per second
                time.sleep(1 - duration)
        page += 1