Created
December 24, 2017 11:31
-
-
Save jdowner/25facbfd1aa3816457ba9c5d2040cd3c to your computer and use it in GitHub Desktop.
url to text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
docopt | |
readability-lxml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Name: | |
turl | |
Usage: | |
turl [--article-dir=<adir>] <url> | |
Options: | |
--article-dir=<adir> where articles are stored [default: ./articles] | |
""" | |
import hashlib | |
import io | |
import os | |
import shlex | |
import subprocess | |
import sys | |
import docopt | |
from readability.readability import Document | |
def main(argv=None):
    """Fetch a web page and store its readable text as an article file.

    The page at ``<url>`` is downloaded with ``curl``, reduced to its main
    content with readability, rendered to plain text with ``lynx`` and
    written into the article directory. The file is named after the MD5 hex
    digest of the stored document, and that digest is printed to stdout.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            (the default) ``sys.argv[1:]`` is read at call time rather than
            being frozen when the module is imported.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status.
    """
    if argv is None:
        argv = sys.argv[1:]

    args = docopt.docopt(__doc__, argv=argv)
    url = args["<url>"]
    articles = args["--article-dir"]

    # How awful to spawn a subprocess to request the webpage. However, urllib
    # and requests proved problematic over a range of websites, mainly because
    # of a requirement for additional headers. Since curl 'just works', it is
    # used for now.
    #
    # The command is passed as an argument list so that the URL reaches curl
    # verbatim: splitting a formatted string with shlex breaks on URLs that
    # contain quotes, spaces or backslashes.
    html = subprocess.check_output(["curl", "-s", url])

    # One Document serves both the title and the summary.
    doc = Document(html)
    title = doc.title()
    summary = doc.summary().encode("utf-8")

    # lynx turns the summarised HTML into plain text.
    process = subprocess.Popen(
        ["lynx", "-dump", "-stdin"],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )
    article, _ = process.communicate(summary)

    document = u"url: {}\ntitle: {}\n--\n\n{}".format(
        url, title, article.decode("utf-8")
    )

    # MD5 is only used to derive a content-based file name.
    digest = hashlib.md5(document.encode("utf-8")).hexdigest()

    # Create the article directory on first use instead of failing on open.
    if not os.path.isdir(articles):
        os.makedirs(articles)

    # Write as UTF-8 explicitly; the locale default may not be able to encode
    # the article text.
    with io.open(os.path.join(articles, digest), "w", encoding="utf-8") as fp:
        fp.write(document)

    print(digest)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment