A super-simple Python script that reads Google News RSS feeds and stores the data in a CSV file.
# Expected use >> python read-GNews.py -q [query] -l [language] -p [country]
# The following command will search for the latest news written in German from Austria about "Redbull"
# python read-GNews.py -q Redbull -l de -p AT
#
# Queries can be provided as strings using quotation marks >> python read-GNews.py -q "Redbull Media House" -l de
# Multiple queries can be executed at once >> python read-GNews.py -q "Redbull Media House" -q Redbull -l de -p at -p de
# The script will save a CSV file containing Title, Link, pubDate, Description, Source and Alexa Traffic Rank.
import feedparser
import time
import sys
import pandas as pd
import re
import urllib.parse
import urllib.request as ur
import argparse
import bs4
# Feed URL
base_url = 'https://news.google.com/rss/search?q='

# Get the parameters
parser = argparse.ArgumentParser()
parser.add_argument('-q', action='append', dest='queries', nargs='+',
                    default=[],
                    help='Add all queries')
parser.add_argument('-l', action='store', dest='language',
                    default="en",
                    help='Store language')
parser.add_argument('-p', action='append', dest='locations', nargs='+',
                    default=[],
                    help='Add all places')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
# Get Alexa Rank - remember it only works from USA so you need a proxy
def getMetrics(url):
    try:
        alexa_rank = bs4.BeautifulSoup(ur.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url), "xml").find("REACH")["RANK"]
    except Exception:
        alexa_rank = None
    return alexa_rank
# HTML cleanup function
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
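# Example (illustrative): the regex strips the tags but keeps the text, so
#   cleanhtml('<a href="https://example.com"><b>Redbull</b> signs new deal</a>')
# returns 'Redbull signs new deal'.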
# Prepare the list that will hold the items
d = []
# Access the feed and store data in d
def readFeed(url, query):
    feed = feedparser.parse(url)
    # Loop over the items in the feed
    for post in feed.entries:
        title = post.title
        link = post.link
        # Convert the published date to yyyy/mm/dd
        pubDate = "%d/%02d/%02d" % (post.published_parsed.tm_year,
                                    post.published_parsed.tm_mon,
                                    post.published_parsed.tm_mday)
        description = cleanhtml(post.summary)
        source = post.source.title
        # Get Alexa Rank
        alexa_rank = getMetrics(link)
        d.append((title, link, pubDate, description, source, query, alexa_rank))
        print(d)
        # Add delay between calls
        time.sleep(2)
    return d
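# Example (illustrative): with the query "Redbull", -l de and -p AT, the URL composed
# below looks like
#   https://news.google.com/rss/search?q="Redbull"&hl=de&ceid=AT%3Ade
# and each feed entry is appended to d as a tuple of
#   (title, link, pubDate, description, source, query, alexa_rank)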
# Get the parameters
args = parser.parse_args()

# Set the language (default = "en")
language = args.language.lower()

# Make sure there is at least one query
if len(args.queries) == 0:
    print("Please add at least one query using the -q parameter")
    sys.exit(1)
# Loop over the different combinations of queries and places
# Make sure there is at least one place
if len(args.locations) > 0:
    # Looping queries and places
    for a in args.queries:
        for b in args.locations:
            query = ''.join(map(str, a))
            # URL encode the query and add quotes around it
            encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
            # ceid takes the form COUNTRY:language (e.g. AT:de)
            place = urllib.parse.quote_plus(''.join(map(str, b)).upper() + ":" + language)
            # Compose the URL
            url = base_url + encoded_query + "&hl=" + language + "&ceid=" + place
            print("Reading now: ", url)
            # Read the Feed
            readFeed(url, query)
else:
    # Just use the query(ies)
    for a in args.queries:
        query = ''.join(map(str, a))
        # URL encode the query and add quotes around it
        encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
        # Compose the URL
        url = base_url + encoded_query
        print("Reading now: ", url)
        # Read the Feed
        readFeed(url, query)
# Set the file name (based on the last query processed)
cleanQuery = re.sub(r'\W+', '', query)
file_name = cleanQuery + ".csv"

df = pd.DataFrame(d, columns=('Title', 'Link', 'pubDate', 'Description', 'Source', 'Query', 'Alexa Rank'))

# Remove all rows with the same link - you might want to comment this when using different keywords
df.drop_duplicates(subset="Link", keep=False, inplace=True)

# Store data to CSV
df.to_csv(file_name, encoding='utf-8', index=False)
print(len(df), "articles saved to", file_name)
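To inspect the result afterwards, the CSV can be loaded back into pandas. A minimal sketch, assuming the single example query "Redbull" from the comments above (which produces Redbull.csv):

import pandas as pd
# Load the CSV written by read-GNews.py; the file name is derived from the query
articles = pd.read_csv("Redbull.csv")
print(articles[["Title", "pubDate", "Source", "Alexa Rank"]].head())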