@jdowner
Created December 24, 2017 11:31
url to text: fetch a web page, extract the readable article, and save it as plain text.
Dependencies:

docopt
readability-lxml
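Both are available from PyPI; a minimal install sketch:

    pip install docopt readability-lxml

(curl and lynx must also be installed on the system, since the script shells out to both.)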
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Name:
    turl

Usage:
    turl [--article-dir=<adir>] <url>

Options:
    --article-dir=<adir>  where articles are stored [default: ./articles]
"""
import hashlib
import io
import os
import shlex
import subprocess
import sys

import docopt
from readability.readability import Document


def main(argv=sys.argv[1:]):
    args = docopt.docopt(__doc__, argv=argv)

    url = args["<url>"]
    articles = args["--article-dir"]

    # How awful to spawn a subprocess to request the webpage. However, I have
    # found that using urllib or requests is problematic over a range of
    # websites. Mainly the problems are due to a requirement for additional
    # headers. But since curl 'just works' I am going with that for now.
    html = subprocess.check_output(shlex.split("curl -s {}".format(url)))
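    # For reference, a rough equivalent using requests (a sketch, not part of
    # the original script); on the problematic sites the missing piece is
    # often a browser-like User-Agent header:
    #
    #     import requests
    #     html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).content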
    title = Document(html).title()

    # Pipe the readability-extracted article through lynx to render its HTML
    # as plain text.
    process = subprocess.Popen(
        shlex.split("lynx -dump -stdin"),
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )

    summary = Document(html).summary().encode("utf-8")
    article, err = process.communicate(summary)

    document = u"url: {}\ntitle: {}\n--\n\n{}"
    document = document.format(url, title, article.decode("utf-8"))

    # Store the article under the md5 digest of its contents, and print the
    # digest so the caller can find the file.
    digest = hashlib.md5(document.encode("utf-8")).hexdigest()
    with io.open(os.path.join(articles, digest), "w") as fp:
        fp.write(document)

    print(digest)

if __name__ == "__main__":
    main()
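
A minimal sketch of driving the script from Python rather than the shell (an illustration, not part of the gist), assuming the file is saved as turl.py, that curl and lynx are on the PATH, and that the article directory already exists, since the script does not create it:

    import turl

    # Writes the rendered article under ./articles and prints its md5
    # digest, which is also the stored filename.
    turl.main(["--article-dir=./articles", "https://example.com"])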