Created
December 20, 2018 17:48
-
-
Save aluxian/3f62f30fa56f56ea7e8c23f445233f93 to your computer and use it in GitHub Desktop.
Extract clean text from a web page with the Mercury API and save it as a PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Fetch the readable text of a web page through Postlight's Mercury parser
# API, render it to a PDF with wkhtmltopdf and open that PDF in Notability.
#
# Usage: <script> URL
# Env:   MERCURY_API_URL  optional override for the parser endpoint
#        MERCURY_API_KEY  optional override for the API key
# Needs: curl, jq, xidel, sponge, wkhtmltopdf, plus the BSD/macOS
#        `date -j` and `open -a` commands used below.

# Strict mode is set here instead of in the shebang so that it also applies
# when the script is started as `bash script.sh`.
set -euo pipefail

# NOTE(review): the fallback key below is a credential stored in source;
# prefer passing MERCURY_API_KEY via the environment and rotating this value.
APIURL="${MERCURY_API_URL:-https://mercury.postlight.com/parser}"
APIKEY="${MERCURY_API_KEY:-ErUeufK8WIFsi6769SZuQEEn1b6CS4D5JAfrvBhJ}"

URL="${1:-}"
if [[ -z "$URL" ]]; then
  printf 'Usage: %s URL\n' "${0##*/}" >&2
  exit 2
fi

TEMPDIR=$(mktemp -d)
# Remove the scratch directory on every exit path, including early failures.
trap 'rm -rf -- "$TEMPDIR"' EXIT

# change working dir
cd "$TEMPDIR"

# download article with Postlight's Mercury API
# --fail turns HTTP errors into a non-zero exit instead of saving an error
# body that would later be parsed as if it were an article.
curl --fail --silent --show-error -G \
  --data-urlencode "url=$URL" \
  -H "x-api-key: $APIKEY" \
  -o result.json \
  "$APIURL"

# Stop early when the response carries no article body (jq -e exits non-zero
# on null/false), rather than rendering a PDF that only says "null".
if ! jq -e '.content' result.json >/dev/null; then
  printf 'error: parser returned no article content for %s\n' "$URL" >&2
  exit 1
fi

# extract data from downloaded json
# Missing fields are defaulted inside jq so that only a real JSON null is
# replaced; a text substitution on the output would also mangle values that
# merely contain the letters "null".
title=$(jq -r '.title // "Untitled"' result.json)
content=$(jq -r '.content' result.json)
author=$(jq -r '.author // "?"' result.json)
published=$(jq -r '.date_published // empty' result.json | cut -c 1-10)
# Any parse failure (including a missing date) falls back to "?".
date=$(date -jf "%Y-%m-%d" "$published" +"%-d %b %Y" 2>/dev/null) || date='?'
url=$(jq -r --arg fallback "$URL" '.url // $fallback' result.json)
domain=$(jq -r '.domain // "?"' result.json)
# Show the count only when it is a number greater than 5; anything else
# (missing, null, tiny) is displayed as "?".
wordcount=$(jq -r '
  if (.word_count | type) == "number" and .word_count > 5
  then .word_count
  else "?"
  end' result.json)

# write html
cat <<EOF > result.html
<style>
html {
font-family: Georgia;
}
img {
width: auto;
height: auto;
max-width: 100%;
max-height: 300px;
}
code {
background-color: #eee;
}
.aligncenter {
text-align: center;
}
.wp-caption-text {
font-style: italic;
}
</style>
<h1>$title</h1>
<div><b>$wordcount</b> words by <b>$author</b> on <a href="$url"><b>$domain</b></a> at <b>$date</b></div>
<hr>
$content
EOF

# filter html
# Replaces any node whose text equals the subscription banner with an empty
# <i> element; sponge lets the result overwrite the file being read.
xidel --html result.html --xquery 'transform(/, function($e) {
if (string($e) = "Subscribe to Praxis, our members-only blog exploring the future of productivity, for just $10/month. Or follow us for free content via Twitter, Facebook, LinkedIn, or YouTube.") then <i></i> else $e
})' | sponge result.html

# convert html to pdf
# Slashes and newlines cannot be part of a single file name, so they are
# replaced; the "./" prefix keeps a title starting with "-" from being read
# as a command-line option.
safe_title=${title//\//-}
safe_title=${safe_title//$'\n'/ }
fname="./${safe_title:-Untitled}.pdf"
wkhtmltopdf --lowquality --image-dpi 300 --image-quality 80 --no-outline --encoding 'utf-8' --disable-smart-shrinking result.html "$fname"

# import into Notability
open -a Notability "$fname"

# clean up
# Presumably this pause gives Notability time to read the PDF before the
# EXIT trap deletes the scratch directory — confirm the delay is sufficient.
sleep 5
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment