# Author: Linwood Creekmore
# Email: [email protected]
# Description: Python script to pull content from a website (works on news stories).
# Licensed under GNU LGPLv3; see https://choosealicense.com/licenses/lgpl-3.0/ for details

# Notes
"""
23 Oct 2017: updated to include readability based on PyCon talk: https://github.com/DistrictDataLabs/PyCon2016/blob/master/notebooks/tutorial/Working%20with%20Text%20Corpora.ipynb
18 Jul 2018: added keywords and summary
"""

###################################
# Standard Library imports
###################################

import re
import datetime
import platform

###################################
# Third party imports
###################################

import pytz
import requests
from newspaper import Article
from bs4 import BeautifulSoup
from readability.readability import Document as Paper
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# cache of urls already processed; maps url -> result (dict, or a string for failures)
done = {}


def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------
    url : str
        web address to news report

    Returns
    -------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str, isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
            keywords (list) - keywords extracted from the article
            summary (str) - machine-generated summary of the article
    """
    global done

    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']

    # regexes for the url check (note: these only match lowercase schemes)
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r'(http://|https://)(www\.)?(.*)(\.[A-Za-z0-9]{1,4})$')
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that it's a url
    if s.search(url):
        # return the cached result if this url has been processed before
        if url in done:
            yield done[url]
            return
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except requests.RequestException:
            # if the url does not return data, set empty values
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return

        # if the url does not return successfully, set empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return

        # test if the length of the url content is greater than 500; if so, fill data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # test for python version because of different html parameters
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the url
            article.parse()
            article.nlp()
            # if the parse pulled enough text, fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the date to isoformat; fall back for naive dates
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer['published_date'] = article.publish_date.astimezone(pytz.utc).isoformat()
                    except ValueError:
                        answer['published_date'] = article.publish_date.isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            # if newspaper didn't pull enough text, try the readability library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                # as we did above, keep the text if it's longer than 200 characters
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # if nothing works above, fall back to BeautifulSoup on content divs
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all(
                            'div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
        yield answer
    # the else clause to catch an invalid url passed in
    else:
        answer['author'] = None
        answer['base'] = None  # the regex found no base url to extract
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
Welcome to Python. You might want to use the materials on this page to get started with learning Python, but I'll help you use this code. First, you need to make sure you have all the Python libraries installed, and I'll take you through the steps. I'm assuming you have Python installed; if not, get it installed one of these ways:
- I advise installing Anaconda to get Python on your system; it's easier.
- The hard way: https://docs.python-guide.org/starting/installation/ (use Python 3).

I am assuming you are using Python 3 from here forward, so these instructions apply to that version. Now that you have Python installed, you need to access it: open your terminal (or the Anaconda Prompt on Windows) and type python to start an interactive session.
If you completed everything above, you now have Python installed and running; the next step is installing the Python libraries the code needs, which are listed in the imports at the top of the script. We will use pip to install everything. You need to access your command line for this as well; if you don't know how, look up how to open the terminal (or command prompt) on your operating system. Here's the command you type to get the install done:
pip install newspaper3k beautifulsoup4 requests readability-lxml lxml pytz
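One gotcha worth flagging: the script calls article.nlp(), and newspaper relies on NLTK tokenizer data that pip does not fetch on its own (nltk itself comes along as a newspaper3k dependency, but its 'punkt' data is a separate download). If article.nlp() complains about a missing resource, this one-liner should pull it down:

python -c "import nltk; nltk.download('punkt')"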
If you made it to this step, you're ready to use the code. First, find a link to any website; I advise using a link to a news article (the example below uses a story from krcrtv.com). Now, paste all the code from my function into your Python session: look at the very top of this page, click the Raw button, copy the code, and paste it into your session. If you installed all the libraries above, that creates the function for you to use. The function only requires one parameter, the url, but I use yield in the function, so you need a built-in function to generate the result. Here is an example of me using my function:
next(textgetter('https://krcrtv.com/news/carr-fire/housing-available-after-the-carr-fire'))
You must use next() or list() to return the result; that's the next() you see above.
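For instance, here is a minimal sketch of both patterns, using the same example link from above (any news URL should work):

result = next(textgetter('https://krcrtv.com/news/carr-fire/housing-available-after-the-carr-fire'))
print(result['title'])
print(result['published_date'])

# or collect every dictionary the generator yields
results = list(textgetter('https://krcrtv.com/news/carr-fire/housing-available-after-the-carr-fire'))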
How do you use this? I tried it with a test URL, and when I tried to print an item from the generator object that came back, I got:
for i in textgetter('HTTP://WWW.ACCESSBHM.COM'): print(i)
--> 182 answer['base'] = s.search(url).group()
AttributeError: 'NoneType' object has no attribute 'group'
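That crash happens because the two regexes in the function only match lowercase http:// and https://, so an all-caps URL falls through to the invalid-url branch, which then calls .group() on a match that failed. The quickest workaround is to lowercase the URL before passing it in; a sketch of a more durable fix (my suggestion, not part of the original gist) is to compile the regexes case-insensitively:

for i in textgetter('HTTP://WWW.ACCESSBHM.COM'.lower()): print(i)

# or, inside the function, match the scheme regardless of case
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)', re.IGNORECASE)
u = re.compile(r'(http://|https://)(www\.)?(.*)(\.[A-Za-z0-9]{1,4})$', re.IGNORECASE)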
Traceback (most recent call last):
File "main.py", line 14, in
from readability.readability import Document as Paper
ModuleNotFoundError: No module named 'readability.readability'
Install this package:
pip3 install readability-lxml
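After installing, a quick sanity check that the exact import the script uses now resolves (run it with the same Python you'll run the script with):

python3 -c "from readability.readability import Document"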
Hi, thanks for the script. But I have some questions about it: how and where do I pass the parameters? I don't have much knowledge of Python, so please help me understand how to use this code.
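There is only one parameter: the url, passed as a string when you call the function. Because textgetter() uses yield, wrap the call in next() to get the dictionary back. A minimal sketch, with a hypothetical link (substitute any real news URL):

article = next(textgetter('https://www.example.com/some-news-story'))  # hypothetical url
print(article['title'])
print(article['text'])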