A Python script to scrape text from websites. It works surprisingly well on most news websites when you have the URL to the story. Use GDELT URLs for the best results.
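A minimal usage sketch, assuming the script below is saved as textgetter.py (the file name and the URL are illustrative assumptions) and its dependencies are installed; textgetter is a generator, so iterate over it to get the result dictionary:

# usage sketch; the module name and URL are assumptions, not part of the gist
from textgetter import textgetter

for result in textgetter('https://www.example.com/some-news-story'):
    print(result['title'])
    print(result['published_date'])
    print(result['text'][:200])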
# Author: Linwood Creekmore
# Email: [email protected]
# Description: Python script to pull content from a website (works on news stories).
# Licensed under GNU GPLv3; see https://choosealicense.com/licenses/lgpl-3.0/ for details

# Notes
"""
23 Oct 2017: updated to include readability based on PyCon talk: https://github.com/DistrictDataLabs/PyCon2016/blob/master/notebooks/tutorial/Working%20with%20Text%20Corpora.ipynb
18 Jul 2018: added keywords and summary
"""
###################################
# Standard Library imports
###################################
import re
import datetime
import platform

###################################
# Third party imports
###################################
import pytz
import requests
from newspaper import Article
from bs4 import BeautifulSoup
from readability.readability import Document as Paper
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# suppress the warning raised by requests when verify=False is used below
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

done = {}

def textgetter(url):
    """Scrapes web news and yields the content.

    Parameters
    ----------
    url : str
        web address to news report

    Yields
    ------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str, isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
            keywords (list) - keywords extracted by newspaper's nlp()
            summary (str) - article summary extracted by newspaper's nlp()
    """
    global done

    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regexes for url check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that it's a url
    if s.search(url):
        # if this url was already processed, yield the cached result and stop
        if url in done.keys():
            yield done[url]
            return
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except:
            # if the url does not return data, set to empty values
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return
        # if url does not return successfully, set to empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return
        # test if length of url content is greater than 500; if so, fill data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # test for python version because the html parameter name differs
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the article
            article.parse()
            article.nlp()
            # if parse pulled enough text, fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the date to isoformat; exception for naive date
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer['published_date'] = article.publish_date.astimezone(pytz.utc).isoformat()
                    except:
                        answer['published_date'] = article.publish_date.isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            # if newspaper didn't pull enough text, try the readability library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                # as we did above, only keep the text if it's longer than 200 characters
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # if nothing works above, use beautiful soup to pull specific divs
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all(
                            'div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
        yield answer
    # the else clause catches an invalid url being passed in
    else:
        answer['author'] = None
        answer['base'] = None
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
Traceback (most recent call last):
  File "main.py", line 14, in <module>
    from readability.readability import Document as Paper
ModuleNotFoundError: No module named 'readability.readability'
Install this package:
pip3 install readability-lxml
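The script also needs its other third-party imports available. Assuming Python 3, something along these lines should cover them (newspaper3k is the Python 3 fork that provides the newspaper module):

pip3 install requests beautifulsoup4 lxml newspaper3k pytz readability-lxml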
How do you use this? I tried it with a test URL, and when I tried to print an item from the generator object that came back I got:
for i in textgetter('HTTP://WWW.ACCESSBHM.COM'): print(i)