A super-simple Python script that reads Google News RSS feeds and stores the data in a CSV file.
# Expected use >> python read-GNews.py -q [query] -l [language] -p [country]
# The following command will search for the latest news written in German from Austria about "Redbull"
# python read-GNews.py -q Redbull -l de -p AT
#
# Queries can be provided as strings using quotation marks >> python read-GNews.py -q "Redbull Media House" -l de
# Multiple queries can be executed at once >> python read-GNews.py -q "Redbull Media House" -q Redbull -l de -p at -p de
# The script will save a CSV file containing Title, Link, pubDate, Description, Source and Alexa Traffic Rank.
import feedparser
import time
import sys
import pandas as pd
import re
import urllib.parse
import urllib.request as ur
import argparse
import bs4
# Feed URL
base_url = 'https://news.google.com/rss/search?q='

# Get the parameters
parser = argparse.ArgumentParser()
parser.add_argument('-q', action='append', dest='queries', nargs='+',
                    default=[],
                    help='Add all queries')
parser.add_argument('-l', action='store', dest='language',
                    default="en",
                    help='Store language')
parser.add_argument('-p', action='append', dest='locations', nargs='+',
                    default=[],
                    help='Add all places')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
# Get Alexa Rank - remember it only works from USA so you need a proxy
def getMetrics(url):
    try:
        alexa_rank = bs4.BeautifulSoup(ur.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url), "xml").find("REACH")["RANK"]
    except Exception:
        alexa_rank = None
    return alexa_rank
# HTML cleanup function
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
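# Example (illustrative): the regex strips the tags but keeps the text, so
#   cleanhtml('<a href="https://example.com"><b>Redbull</b> signs new deal</a>')
# returns 'Redbull signs new deal'.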
# Prepare the list that will hold the items
d = []
# Access the feed and store data in d
def readFeed(url, query):
    feed = feedparser.parse(url)
    # Loop over the items in the feed
    for post in feed.entries:
        title = post.title
        link = post.link
        # Convert the published date to yyyy/mm/dd
        pubDate = "%d/%02d/%02d" % (post.published_parsed.tm_year,
                                    post.published_parsed.tm_mon,
                                    post.published_parsed.tm_mday)
        description = cleanhtml(post.summary)
        source = post.source.title
        # Get Alexa Rank
        alexa_rank = getMetrics(link)
        d.append((title, link, pubDate, description, source, query, alexa_rank))
        print(d)
        # Add delay between calls
        time.sleep(2)
    return d
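# Example (illustrative): with the query "Redbull", -l de and -p AT, the URL composed
# below looks like
#   https://news.google.com/rss/search?q="Redbull"&hl=de&ceid=AT%3Ade
# and each feed entry is appended to d as a tuple of
#   (title, link, pubDate, description, source, query, alexa_rank)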
# Get the parameters
args = parser.parse_args()

# Set the language (default = "en")
language = args.language.lower()

# Make sure there is at least one query
if len(args.queries) == 0:
    print("Please add at least one query using the -q parameter")
    sys.exit(1)
# Loop over the different combinations of queries and places
# Make sure there is at least one place
if len(args.locations) > 0:
    # Looping queries and places
    for a in args.queries:
        for b in args.locations:
            query = ''.join(map(str, a))
            # URL encode the query and add quotes around it
            encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
            # ceid takes the form COUNTRY:language (e.g. AT:de)
            place = urllib.parse.quote_plus(''.join(map(str, b)).upper() + ":" + language)
            # Compose the URL
            url = base_url + encoded_query + "&hl=" + language + "&ceid=" + place
            print("Reading now: ", url)
            # Read the Feed
            readFeed(url, query)
else:
    # Just use the query(ies)
    for a in args.queries:
        query = ''.join(map(str, a))
        # URL encode the query and add quotes around it
        encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
        # Compose the URL
        url = base_url + encoded_query
        print("Reading now: ", url)
        # Read the Feed
        readFeed(url, query)
# Set the file name (based on the last query processed)
cleanQuery = re.sub(r'\W+', '', query)
file_name = cleanQuery + ".csv"

df = pd.DataFrame(d, columns=('Title', 'Link', 'pubDate', 'Description', 'Source', 'Query', 'Alexa Rank'))

# Remove all rows with the same link - you might want to comment this when using different keywords
df.drop_duplicates(subset="Link", keep=False, inplace=True)

# Store data to CSV
df.to_csv(file_name, encoding='utf-8', index=False)
print(len(df), "articles saved to", file_name)
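To inspect the result afterwards, the CSV can be loaded back into pandas. A minimal sketch, assuming the single example query "Redbull" from the comments above (which produces Redbull.csv):

import pandas as pd
# Load the CSV written by read-GNews.py; the file name is derived from the query
articles = pd.read_csv("Redbull.csv")
print(articles[["Title", "pubDate", "Source", "Alexa Rank"]].head())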