jaflo · May 6, 2020 03:51
diff --git a/description.md b/description.md
diff --git a/scrape.py b/scrape.py
 #!/usr/bin/env python
 # coding: utf-8

 import requests
 import time
 from bs4 import BeautifulSoup
 import pdfkit
 import os

 # Authenticated session shared by every request below. Replace the
 # placeholder credentials with a real Instapaper login before running.
 s = requests.Session()
 s.post("https://www.instapaper.com/user/login", data={
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
    "keep_logged_in": "yes"
 })

 # Directory that receives the intermediate HTML files and the final PDFs.
 base = "./output/"
 # Create it up front: the script lists this directory and writes into it,
 # and both operations fail when it does not exist yet.
 os.makedirs(base, exist_ok=True)


 def get_ids(page=1):
    """Fetch one page of the Instapaper list and collect its article ids.

    Returns a tuple ``(ids, has_more)``: ``ids`` holds the ``id`` attribute
    of every ``<article>`` element inside ``#article_list`` with the
    ``article_`` text removed, and ``has_more`` is True when the page
    contains an element with the ``paginate_older`` class.
    """
    url = "https://www.instapaper.com/u/" + str(page)
    document = BeautifulSoup(s.get(url).text, "html.parser")

    ids = []
    for entry in document.find(id="article_list").find_all("article"):
        ids.append(entry["id"].replace("article_", ""))

    older_link = document.find(class_="paginate_older")
    return ids, older_link is not None


 def get_article(id):
    """Download the reader view of one article and extract its parts.

    Returns a dict with the stripped ``title`` (text of the ``<h1>`` in
    ``#titlebar``), the stripped ``origin`` (text of the ``origin_line``
    element in ``#titlebar``) and the stripped ``content`` (inner markup of
    ``#story``).
    """
    page = s.get("https://www.instapaper.com/read/" + str(id))
    document = BeautifulSoup(page.text, "html.parser")

    titlebar = document.find(id="titlebar")
    heading = titlebar.find("h1").getText().strip()
    source_line = titlebar.find(class_="origin_line").getText().strip()
    body_markup = document.find(id="story").decode_contents().strip()

    return {"title": heading, "origin": source_line, "content": body_markup}


 def article_converted(id):
    """Return the path of the PDF already generated for ``id``, if any.

    Output files are named ``"<id> <title>.pdf"`` (or ``"<id>.pdf"`` when the
    sanitized title is empty), so a file only counts as a match when the id
    is followed by a space or directly by the extension. A bare prefix test
    would let id ``"123"`` match the PDF that belongs to id ``"1234"``.

    Returns ``base + file_name`` for the first match, or ``None`` when no
    PDF exists or the output directory is missing.
    """
    try:
        candidates = os.listdir(base)
    except FileNotFoundError:
        # No output directory means nothing has been converted yet.
        return None

    for file_name in candidates:
        if not file_name.endswith(".pdf"):
            continue
        stem = file_name[:-len(".pdf")]
        if stem == id or stem.startswith(id + " "):
            return base + file_name
    return None


 def download_article(id):
    """Fetch article ``id`` and save it as a standalone HTML file in ``base``.

    The file is named ``"<id> <title>.html"`` after dropping every character
    that is not a letter, a digit or a space and trimming trailing
    whitespace. Returns the path of the written file.

    Any exception raised by ``get_article`` or by the file write propagates
    to the caller.
    """
    from html import escape

    article = get_article(id)

    raw_name = id + " " + article["title"]
    safe_name = "".join(
        c for c in raw_name if c.isalpha() or c.isdigit() or c == " "
    ).rstrip()
    file_name = base + safe_name + ".html"

    # Title and origin are plain text taken from getText(); escape them so
    # characters such as "<" or "&" cannot break the generated markup. The
    # story content is already markup and is written unchanged.
    title = escape(article["title"])
    origin = escape(article["origin"])

    # Write UTF-8 explicitly: the page contains "·" plus arbitrary article
    # text, and the platform default encoding may not be able to encode it.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("<h1>%s</h1>" % title)
        file.write("<div id='origin'>%s · %s</div>" % (origin, id))
        file.write(article["content"])

    return file_name


 def convert_to_pdf(file_name):
    """Render the HTML file ``file_name`` to a PDF stored next to it.

    The PDF keeps the same path with the extension replaced by ``.pdf`` and
    is laid out on Letter paper with 0.75in margins, using ``styles.css`` as
    the user style sheet. Returns the path of the PDF. Exceptions raised by
    ``pdfkit.from_file`` propagate to the caller.
    """
    # splitext swaps the real extension instead of assuming that the name
    # ends with the five characters ".html".
    new_name = os.path.splitext(file_name)[0] + ".pdf"

    margin = "0.75in"
    options = {
        "page-size": "Letter",
        "margin-top": margin,
        "margin-right": margin,
        "margin-bottom": margin,
        "margin-left": margin,
        "encoding": "UTF-8",
        "no-outline": None,
        "user-style-sheet": "styles.css",
        # Keep going when a resource referenced by the page fails to load.
        "load-error-handling": "ignore",
        "quiet": "",
    }

    pdfkit.from_file(file_name, new_name, options=options)
    return new_name


 # Conversion attempts per article: the first try plus ten retries.
 max_attempts = 11

 has_more = True
 page = 1

 # The context manager closes failed.txt even when an unexpected error or a
 # keyboard interrupt ends the run early.
 with open("failed.txt", "a+") as failure_log:
    while has_more:
        print("Page " + str(page))
        ids, has_more = get_ids(page)
        for id in ids:
            print("  " + id + ": ", end="")
            if article_converted(id):
                print("exists")
                continue

            start = time.time()
            try:
                file_name = download_article(id)
            except Exception as e:
                # Best effort: record the failure and move to the next id.
                print("failed!")
                print(e)
                failure_log.write("%s\t%s\n" % (id, str(e)))
                failure_log.flush()
                continue

            last_error = None
            for _ in range(max_attempts):
                try:
                    convert_to_pdf(file_name)
                except Exception as e:
                    # Keep the error; "e" is unbound once the clause ends.
                    last_error = e
                else:
                    break
            else:
                # Every attempt raised: log the last error and carry on.
                print("failed!")
                print(last_error)
                failure_log.write("%s\t%s\n" % (id, str(last_error)))
                failure_log.flush()

            duration = time.time() - start
            print(str(round(duration, 2)) + " seconds")
            if duration < 1:  # spend at least one second per article
                time.sleep(1 - duration)
        page += 1
diff --git a/styles.css b/styles.css
 /* Style sheet for the generated article pages; scrape.py passes
    "styles.css" to pdfkit as the user style sheet. */

 /* Remove margin and padding from the root and body elements. */
 html,
 body {
 	margin: 0;
 	padding: 0;
 }

 /* Base typography: serif face, hyphenation off, black text on white. */
 html {
 	font-family: Georgia, "Times New Roman", Times, serif;
 	font-size: 20px;
 	line-height: 1.5;
 	word-wrap: break-word;
 	-webkit-hyphens: none;
 	-ms-hyphens: none;
 	hyphens: none;
 	padding: 1em;
 	background: white;
 	color: black;
 }

 /* The "origin · id" line that scrape.py writes below the title. */
 #origin {
 	opacity: 0.6;
 	margin-bottom: 2em;
 }

 /* Links take the surrounding text color and get a solid underline. */
 a {
 	text-decoration: none;
 	border-bottom: 2px solid;
 	color: inherit;
 }

 /* Append each link's href in parentheses after the link text. The white
    border presumably paints over the link underline under the appended
    URL; confirm against rendered output. */
 a:after {
 	content: " (" attr(href) ")";
 	border-bottom: 2px solid white;
 	color: rgba(0, 0, 0, 0.6);
 }

 /* Keep images within the width of their container. */
 img {
 	max-width: 100%;
 }

 /* Smaller, tighter monospace text for code. */
 pre,
 code {
 	font-size: 16px;
 	line-height: 1.1em;
 	font-family: "Monaco", monospace;
 }

 /* Drop the margin of a paragraph that is the first child of a list item. */
 li p:first-child {
 	margin: 0;
 }
	#!/usr/bin/env python
	# coding: utf-8

	import requests
	import time
	from bs4 import BeautifulSoup
	import pdfkit
	import os

	# Authenticated session shared by every request below. Replace the
	# placeholder credentials with a real Instapaper login before running.
	s = requests.Session()
	s.post("https://www.instapaper.com/user/login", data={
	    "username": "YOUR_USERNAME",
	    "password": "YOUR_PASSWORD",
	    "keep_logged_in": "yes"
	})

	# Directory that receives the intermediate HTML files and the final PDFs.
	base = "./output/"
	# Create it up front: the script lists this directory and writes into it,
	# and both operations fail when it does not exist yet.
	os.makedirs(base, exist_ok=True)


	def get_ids(page=1):
	    """Fetch one page of the Instapaper list and collect its article ids.

	    Returns a tuple ``(ids, has_more)``: ``ids`` holds the ``id``
	    attribute of every ``<article>`` element inside ``#article_list``
	    with the ``article_`` text removed, and ``has_more`` is True when the
	    page contains an element with the ``paginate_older`` class.
	    """
	    # Body indentation restored; the flattened form was not valid Python.
	    r = s.get("https://www.instapaper.com/u/" + str(page))
	    soup = BeautifulSoup(r.text, "html.parser")

	    articles = soup.find(id="article_list").find_all("article")
	    ids = [i["id"].replace("article_", "") for i in articles]
	    has_more = soup.find(class_="paginate_older") is not None
	    return ids, has_more


	def get_article(id):
	    """Download the reader view of one article and extract its parts.

	    Returns a dict with the stripped ``title`` (text of the ``<h1>`` in
	    ``#titlebar``), the stripped ``origin`` (text of the ``origin_line``
	    element in ``#titlebar``) and the stripped ``content`` (inner markup
	    of ``#story``).
	    """
	    # Body indentation restored; the flattened form was not valid Python.
	    r = s.get("https://www.instapaper.com/read/" + str(id))
	    soup = BeautifulSoup(r.text, "html.parser")

	    titlebar = soup.find(id="titlebar")
	    title = titlebar.find("h1").getText()
	    origin = titlebar.find(class_="origin_line").getText()
	    content = soup.find(id="story").decode_contents()
	    return {
	        "title": title.strip(),
	        "origin": origin.strip(),
	        "content": content.strip()
	    }


	def article_converted(id):
	    """Return the path of the PDF already generated for ``id``, if any.

	    Output files are named ``"<id> <title>.pdf"`` (or ``"<id>.pdf"`` when
	    the sanitized title is empty), so a file only counts as a match when
	    the id is followed by a space or directly by the extension. A bare
	    prefix test would let id ``"123"`` match the PDF of id ``"1234"``.

	    Returns ``base + file_name`` for the first match, or ``None`` when no
	    PDF exists or the output directory is missing.
	    """
	    try:
	        candidates = os.listdir(base)
	    except FileNotFoundError:
	        # No output directory means nothing has been converted yet.
	        return None

	    for file_name in candidates:
	        if not file_name.endswith(".pdf"):
	            continue
	        stem = file_name[:-len(".pdf")]
	        if stem == id or stem.startswith(id + " "):
	            return base + file_name
	    return None


	def download_article(id):
	    """Fetch article ``id`` and save it as an HTML file in ``base``.

	    The file is named ``"<id> <title>.html"`` after dropping every
	    character that is not a letter, a digit or a space and trimming
	    trailing whitespace. Returns the path of the written file.

	    Any exception raised by ``get_article`` or by the file write
	    propagates to the caller.
	    """
	    from html import escape

	    article = get_article(id)

	    raw_name = id + " " + article["title"]
	    safe_name = "".join(
	        c for c in raw_name if c.isalpha() or c.isdigit() or c == " "
	    ).rstrip()
	    file_name = base + safe_name + ".html"

	    # Title and origin are plain text taken from getText(); escape them
	    # so characters such as "<" or "&" cannot break the generated markup.
	    title = escape(article["title"])
	    origin = escape(article["origin"])

	    # Write UTF-8 explicitly: the page contains "·" plus arbitrary article
	    # text, and the platform default encoding may not encode it.
	    with open(file_name, "w", encoding="utf-8") as file:
	        file.write("<h1>%s</h1>" % title)
	        file.write("<div id='origin'>%s · %s</div>" % (origin, id))
	        file.write(article["content"])

	    return file_name


	def convert_to_pdf(file_name):
	    """Render the HTML file ``file_name`` to a PDF stored next to it.

	    The PDF keeps the same path with the extension replaced by ``.pdf``
	    and is laid out on Letter paper with 0.75in margins, using
	    ``styles.css`` as the user style sheet. Returns the path of the PDF.
	    Exceptions raised by ``pdfkit.from_file`` propagate to the caller.
	    """
	    # splitext swaps the real extension instead of assuming that the name
	    # ends with the five characters ".html".
	    new_name = os.path.splitext(file_name)[0] + ".pdf"

	    margin = "0.75in"
	    options = {
	        "page-size": "Letter",
	        "margin-top": margin,
	        "margin-right": margin,
	        "margin-bottom": margin,
	        "margin-left": margin,
	        "encoding": "UTF-8",
	        "no-outline": None,
	        "user-style-sheet": "styles.css",
	        "load-error-handling": "ignore",
	        "quiet": "",
	    }

	    pdfkit.from_file(file_name, new_name, options=options)
	    return new_name


	# Conversion attempts per article: the first try plus ten retries.
	max_attempts = 11

	has_more = True
	page = 1

	# The context manager closes failed.txt even when an unexpected error or
	# a keyboard interrupt ends the run early.
	with open("failed.txt", "a+") as failure_log:
	    while has_more:
	        print("Page " + str(page))
	        ids, has_more = get_ids(page)
	        for id in ids:
	            print(" " + id + ": ", end="")
	            if article_converted(id):
	                print("exists")
	                continue

	            start = time.time()
	            try:
	                file_name = download_article(id)
	            except Exception as e:
	                # Best effort: record the failure and move on.
	                print("failed!")
	                print(e)
	                failure_log.write("%s\t%s\n" % (id, str(e)))
	                failure_log.flush()
	                continue

	            last_error = None
	            for _ in range(max_attempts):
	                try:
	                    convert_to_pdf(file_name)
	                except Exception as e:
	                    # Keep the error; "e" is unbound once the clause ends.
	                    last_error = e
	                else:
	                    break
	            else:
	                # Every attempt raised: log the last error and carry on.
	                print("failed!")
	                print(last_error)
	                failure_log.write("%s\t%s\n" % (id, str(last_error)))
	                failure_log.flush()

	            duration = time.time() - start
	            print(str(round(duration, 2)) + " seconds")
	            if duration < 1:  # spend at least one second per article
	                time.sleep(1 - duration)
	        page += 1
	/* Style sheet for the generated article pages. */

	/* Remove margin and padding from the root and body elements. */
	html,
	body {
	margin: 0;
	padding: 0;
	}

	/* Base typography: serif face, hyphenation off, black text on white. */
	html {
	font-family: Georgia, "Times New Roman", Times, serif;
	font-size: 20px;
	line-height: 1.5;
	word-wrap: break-word;
	-webkit-hyphens: none;
	-ms-hyphens: none;
	hyphens: none;
	padding: 1em;
	background: white;
	color: black;
	}

	/* The element with id "origin": dimmed, with space below it. */
	#origin {
	opacity: 0.6;
	margin-bottom: 2em;
	}

	/* Links take the surrounding text color and get a solid underline. */
	a {
	text-decoration: none;
	border-bottom: 2px solid;
	color: inherit;
	}

	/* Append each link's href in parentheses after the link text. */
	a:after {
	content: " (" attr(href) ")";
	border-bottom: 2px solid white;
	color: rgba(0, 0, 0, 0.6);
	}

	/* Keep images within the width of their container. */
	img {
	max-width: 100%;
	}

	/* Smaller, tighter monospace text for code. */
	pre,
	code {
	font-size: 16px;
	line-height: 1.1em;
	font-family: "Monaco", monospace;
	}

	/* Drop the margin of a paragraph that is the first child of a list item. */
	li p:first-child {
	margin: 0;
	}