Created: February 3, 2020 19:10
Fetch recent Google News articles matching a keyword
''' Script to fetch recent Google News articles matching a keyword
* Utilizes the Google News RSS feed
Requirements:
    requests - pip install requests
    newspaper - pip install newspaper3k
    feedparser - pip install feedparser
'''
import json
import re

import feedparser
import requests
from newspaper import Article

ALL_NEWS_FEED_URL = "https://news.google.com/news/rss/?ned=us&gl=US&hl=en"
def call_url(url):
    response = None
    user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 " \
                 "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"

    try:
        response = requests.get(url, headers={"User-Agent": user_agent})
        # Raise for 4xx/5xx responses so the HTTPError handler below reports them.
        response.raise_for_status()
    except requests.HTTPError as e:
        print("\nError accessing '{}': {} [{}]".format(
            url, e.response.reason, e.response.status_code), "\n")
    except Exception as e:
        print("\nError accessing '{}'\n".format(url), e, "\n")

    if response and response.status_code == 200:
        return response.text
def parse_article_from_url(url):
    # Parse the already-downloaded HTML with newspaper rather than letting
    # Article fetch it, so call_url's custom User-Agent header is used.
    article = Article("")
    source_html = call_url(url)

    if source_html:
        article.set_html(source_html)
        article.parse()

        if article.text:
            return dict(title=article.title, article=article.text)
def parse_news(keyword=None, include_related_coverage=False):
    if keyword is not None and keyword.strip():
        keyword = keyword.strip()
        print("\n Please wait... Fetching news article links"
              " about '{}' on Google News...\n".format(keyword))
    else:
        print("\n Please wait... Fetching recent news article links...\n")

    keyword_url = "https://news.google.com/news/rss/search/section/" \
                  "q/{0}/{0}?hl=en&gl=US&ned=us".format(keyword)
    url = keyword_url if keyword else ALL_NEWS_FEED_URL

    xml_news_feed = call_url(url)
    parsed_feed = feedparser.parse(xml_news_feed)
    news_links = [news_item["link"] for news_item in parsed_feed["entries"]]

    if include_related_coverage:
        # Each entry's summary is an HTML snippet that also contains links
        # to related coverage of the same story; collect those instead.
        xml_news_links = [news_item["summary"]
                          for news_item in parsed_feed["entries"]]
        news_links = []

        for related_news_links in xml_news_links:
            all_related_links = re.findall(
                r'(?<=\shref=")http\S+(?=")', related_news_links
            )
            # Drop the last matched link in each summary.
            del all_related_links[-1]
            news_links.extend(all_related_links)

    print("\nArticle links acquired:\n\n", news_links,
          "\n\nTotal no. of links:", len(news_links), "\n")
    print("\n...Fetching and parsing articles from the acquired links.\n")

    formatted_response = [parse_article_from_url(link)
                          for link in news_links
                          if not link.endswith((".mp4", ".mp3"))]
    articles_json = json.dumps(
        [article for article in formatted_response if article is not None],
        ensure_ascii=False
    )

    print("\nDone!\n\n")
    return articles_json
print(parse_news('kenya'))  # Test search
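A minimal usage sketch of the related-coverage mode, added for illustration: the keyword below is arbitrary, and the snippet assumes it runs in the same module as the functions above (parse_news returns a JSON string, so it is decoded before iterating).

# Illustrative example (arbitrary keyword), not part of the original gist:
articles = json.loads(parse_news("electric cars", include_related_coverage=True))
for entry in articles:
    print(entry["title"])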