Skip to content

Instantly share code, notes, and snippets.

@oogali
Created January 25, 2014 07:03
Show Gist options
  • Save oogali/8612811 to your computer and use it in GitHub Desktop.
Save oogali/8612811 to your computer and use it in GitHub Desktop.
A basic attempt at using the newspaper module
#!/usr/bin/env python
import sys
import logging
import newspaper
def _ascii_only(text):
    """Return *text* as a text string with every non-ASCII character dropped."""
    # Encoding alone yields ``bytes`` on Python 3, which would print as
    # ``b'...'``; decoding back keeps the printed output identical on 2 and 3.
    return text.encode('ascii', 'ignore').decode('ascii')


def main(argv=None):
    """Build a ``newspaper`` source from a URL and print its articles and categories.

    For every article of the built source the function downloads, parses and
    runs ``nlp()`` on it, printing the title, URL, authors, summary and
    keywords.  Afterwards it prints every category URL of the source.
    Progress messages are logged to ``newspaper.log`` at INFO level.

    Args:
        argv: Command-line vector (program name followed by exactly one URL).
            Defaults to ``sys.argv`` when ``None``.

    Returns:
        ``0`` on success, ``-1`` when the argument count is wrong (the usage
        line is printed in that case and nothing else is done).
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 2:
        # Guard against an empty vector so the usage line itself cannot raise.
        program = argv[0] if argv else sys.argv[0]
        print('{} <url>'.format(program))
        return -1

    logging.basicConfig(
        filename='newspaper.log',
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        level=logging.INFO,
    )

    paper = newspaper.build(argv[1])

    print('==> articles <==')
    for article in paper.articles:
        logging.info('starting article download')
        article.download()
        logging.info('finished article download')

        # Parse BEFORE reporting the title: the title is extracted from the
        # downloaded HTML by parse(), so reading it earlier can yield an
        # empty or stale value.
        logging.info('starting article parsing')
        article.parse()
        logging.info('finished article parsing')

        print('** {} **'.format(_ascii_only(article.title)))
        print('URL: {}'.format(article.url))
        print('Authors: {}'.format(', '.join(article.authors)))

        logging.info('starting article neuro-linguistic processing')
        article.nlp()
        logging.info('finishing article neuro-linguistic processing')

        print('Summary: {}'.format(_ascii_only(article.summary)))
        print('Keywords: {}'.format(', '.join(article.keywords)))
        # Blank separator line between articles ('' keeps this valid on
        # both Python 2 and Python 3).
        print('')

    print('==> categories <==')
    for category in paper.category_urls():
        print(category)

    return 0
# Script entry point: run main() and hand its return value to the shell as
# the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment