eliasdabbas · March 14, 2022 18:19 · eliasdabbas · Mar 14, 2022
diff --git a/parse_news_sitemaps.py b/parse_news_sitemaps.py
 import datetime

 import advertools as adv
 import pandas as pd


 # Words and symbols passed as ``rm_words`` to the word-frequency count so they
 # are left out of the results.
 stopwords = [
     'to', 'of', 'the', 'in', 'for', 'and', 'on', 'a',
     'as', 'with', 'from', 'over', 'is', 'at', '—', '-',
     'be', '2022', '–', 'it', 'by', 'we', 'why', 'but',
     'my', 'how', 'not', 'an', 'are', 'no', 'go', 'your',
     'up', 'his',
 ]


 def news_sitemap_wordcount(news_sitemap, name, phrase_len=1, showtop=30,
                            filter_func=lambda df: df):
     """Build a styled table of word/phrase counts from news sitemap titles.

     The sitemap is loaded with ``adv.sitemap_to_df``, passed through
     ``filter_func``, and the ``news_title`` column is counted with
     ``adv.word_frequency`` (excluding the module-level ``stopwords``).

     Args:
         news_sitemap: Sitemap location handed to ``adv.sitemap_to_df``.
         name: Label shown in the table caption.
         phrase_len: Number of words per counted phrase.
         showtop: Maximum number of rows to keep from the top of the
             frequency table.
         filter_func: Callable that receives the sitemap DataFrame and
             returns the DataFrame whose titles should be counted. The
             default returns its input unchanged.

     Returns:
         A pandas ``Styler`` over at most ``showtop`` rows, indexed from 1,
         with a caption holding ``name`` and the current UTC date, and a bar
         rendered on the ``abs_freq`` column.
     """
     sitemap_df = filter_func(adv.sitemap_to_df(news_sitemap))
     word_freq = adv.word_frequency(sitemap_df['news_title'],
                                    rm_words=stopwords,
                                    phrase_len=phrase_len)
     # Timezone-aware replacement for the deprecated datetime.utcnow();
     # the formatted date is the same.
     now = datetime.datetime.now(datetime.timezone.utc).strftime('%d %b, %Y')
     top = word_freq[:showtop]
     # Number rows by how many are actually present: a sitemap with fewer
     # than `showtop` distinct words would otherwise make set_index raise a
     # length-mismatch ValueError.
     top = top.set_index(pd.RangeIndex(1, len(top) + 1))
     return (top
             .style.set_caption(
                 f'<h2>{name} news topics</h2><h5>{now}</h5>')
             .bar(subset=['abs_freq'], color='lightgray'))

 # One entry per news site: (sitemap URL, caption label, row filter).
 # Filters that return their argument unchanged keep every sitemap row.
 news_sitemap_urls = [
     ('https://www.ft.com/sitemaps/news.xml', 'FT', lambda x: x),
     (
         'https://www.nytimes.com/sitemaps/new/news.xml.gz',
         'NYTimes',
         # Keep only rows whose URL contains '/2022/'.
         lambda df: df[df['loc'].str.contains('/2022/')],
     ),
     (
         'https://www.bbc.com/sitemaps/https-index-com-news.xml',
         'BBC',
         # Keep only rows published under the name 'BBC News'.
         lambda df: df[df['publication_name'].eq('BBC News')],
     ),
     ('https://www.economist.com/googlenews.xml', 'Economist', lambda x: x),
     ('https://www.bloomberg.com/feeds/bbiz/sitemap_news.xml', 'Bloomberg',
      lambda x: x),
     ('https://news.sky.com/sitemap/sitemap-news.xml', 'SKY', lambda x: x),
     ('https://www.washingtonpost.com/arcio/news-sitemap/', 'Wash.Post',
      lambda x: x),
     ('https://www.foxnews.com/sitemap.xml?type=news', 'FOX', lambda x: x),
 ]

 sitemaps_df = pd.DataFrame(
     news_sitemap_urls,
     columns=['url', 'name', 'filter_func'],
 )

 # Collect two styled tables per site: single words first, then two-word
 # phrases, each limited to the top 20 rows.
 final_dfs = []
 for sitemap, name, filterfunc in sitemaps_df.itertuples(index=False,
                                                         name=None):
     for ngram in (1, 2):
         df = news_sitemap_wordcount(
             sitemap,
             name,
             phrase_len=ngram,
             showtop=20,
             filter_func=filterfunc,
         )
         final_dfs.append(df)
	import datetime

	import advertools as adv
	import pandas as pd


	# Words and symbols passed as ``rm_words`` to the word-frequency count so
	# they are left out of the results.
	stopwords = [
	    'to', 'of', 'the', 'in', 'for', 'and', 'on', 'a',
	    'as', 'with', 'from', 'over', 'is', 'at', '—', '-',
	    'be', '2022', '–', 'it', 'by', 'we', 'why', 'but',
	    'my', 'how', 'not', 'an', 'are', 'no', 'go', 'your',
	    'up', 'his',
	]


	def news_sitemap_wordcount(news_sitemap, name, phrase_len=1, showtop=30,
	                           filter_func=lambda df: df):
	    """Build a styled table of word/phrase counts from news sitemap titles.

	    The sitemap is loaded with ``adv.sitemap_to_df``, passed through
	    ``filter_func``, and the ``news_title`` column is counted with
	    ``adv.word_frequency`` (excluding the module-level ``stopwords``).

	    Args:
	        news_sitemap: Sitemap location handed to ``adv.sitemap_to_df``.
	        name: Label shown in the table caption.
	        phrase_len: Number of words per counted phrase.
	        showtop: Maximum number of rows to keep from the top of the
	            frequency table.
	        filter_func: Callable that receives the sitemap DataFrame and
	            returns the DataFrame whose titles should be counted. The
	            default returns its input unchanged.

	    Returns:
	        A pandas ``Styler`` over at most ``showtop`` rows, indexed from 1,
	        with a caption holding ``name`` and the current UTC date, and a
	        bar rendered on the ``abs_freq`` column.
	    """
	    sitemap_df = filter_func(adv.sitemap_to_df(news_sitemap))
	    word_freq = adv.word_frequency(sitemap_df['news_title'],
	                                   rm_words=stopwords,
	                                   phrase_len=phrase_len)
	    # Timezone-aware replacement for the deprecated datetime.utcnow();
	    # the formatted date is the same.
	    now = datetime.datetime.now(datetime.timezone.utc).strftime('%d %b, %Y')
	    top = word_freq[:showtop]
	    # Number rows by how many are actually present: a sitemap with fewer
	    # than `showtop` distinct words would otherwise make set_index raise
	    # a length-mismatch ValueError.
	    top = top.set_index(pd.RangeIndex(1, len(top) + 1))
	    return (top
	            .style.set_caption(
	                f'<h2>{name} news topics</h2><h5>{now}</h5>')
	            .bar(subset=['abs_freq'], color='lightgray'))

	# One entry per news site: (sitemap URL, caption label, row filter).
	# Filters that return their argument unchanged keep every sitemap row.
	news_sitemap_urls = [
	    ('https://www.ft.com/sitemaps/news.xml', 'FT', lambda x: x),
	    ('https://www.nytimes.com/sitemaps/new/news.xml.gz', 'NYTimes',
	     # Keep only rows whose URL contains '/2022/'.
	     lambda df: df[df['loc'].str.contains('/2022/')]),
	    ('https://www.bbc.com/sitemaps/https-index-com-news.xml', 'BBC',
	     # Keep only rows published under the name 'BBC News'.
	     lambda df: df[df['publication_name'].eq('BBC News')]),
	    ('https://www.economist.com/googlenews.xml', 'Economist', lambda x: x),
	    ('https://www.bloomberg.com/feeds/bbiz/sitemap_news.xml', 'Bloomberg',
	     lambda x: x),
	    ('https://news.sky.com/sitemap/sitemap-news.xml', 'SKY', lambda x: x),
	    ('https://www.washingtonpost.com/arcio/news-sitemap/', 'Wash.Post',
	     lambda x: x),
	    ('https://www.foxnews.com/sitemap.xml?type=news', 'FOX', lambda x: x),
	]

	sitemaps_df = pd.DataFrame(news_sitemap_urls,
	                           columns=['url', 'name', 'filter_func'])

	# Collect two styled tables per site: single words first, then two-word
	# phrases, each limited to the top 20 rows.
	final_dfs = []

	for sitemap, name, filterfunc in sitemaps_df.values:
	    for ngram in [1, 2]:
	        df = news_sitemap_wordcount(sitemap, name,
	                                    filter_func=filterfunc,
	                                    showtop=20, phrase_len=ngram)
	        final_dfs.append(df)