Last active
April 13, 2023 00:44
-
-
Save eliasdabbas/1d4e24a77669092c780b09b9ff0fa593 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv
import adviz
import pandas as pd
# Build a URL-structure chart for the nytimes.com URLs listed in its 2022
# sitemaps. Written as a notebook-style script: the trailing bare `fig`
# expression displays the chart when run in a notebook cell.

# get URLs of the sitemap index
nyt = adv.sitemap_to_df('https://nytimes.com/robots.txt', recursive=False)

# get URLs of the /sitemap.xml.gz sitemap index
nyt_sitemap_index = adv.sitemap_to_df(
    'https://www.nytimes.com/sitemaps/new/sitemap.xml.gz', recursive=False)

nyt_2022 = []  # one DataFrame per successfully downloaded 2022 sitemap
errors = []    # (sitemap URL, error message) for every sitemap that failed

# keep only the sitemaps whose URL mentions 2022
nyt_2022_urls = nyt_sitemap_index[
    nyt_sitemap_index['loc'].str.contains('2022')]['loc']

for sitemapurl in nyt_2022_urls:
    # Best effort: a single failing sitemap must not abort the whole run, so
    # the failure is recorded in `errors` and the loop moves on.
    try:
        tempdf = adv.sitemap_to_df(sitemapurl)
    except Exception as e:
        errors.append((sitemapurl, str(e)))
    else:
        nyt_2022.append(tempdf)

# pd.concat raises an uninformative ValueError on an empty list; fail with the
# same exception type but report what actually went wrong.
if not nyt_2022:
    raise ValueError(
        f'No 2022 sitemaps were loaded; {len(errors)} error(s): {errors}')

nyt22 = pd.concat(nyt_2022, ignore_index=True)

# create chart (remove dates in URLs /YYYY/MM/DD to get a better topic overview)
# The pattern is a raw string so that `\d` reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
fig = adviz.url_structure(
    nyt22['loc'].str.replace(r'/2022/\d\d/\d\d', '', regex=True),
    items_per_level=30,
    theme='seaborn',
    height=750,
    # NOTE(review): the URL count in the title is hard-coded; presumably it was
    # len(nyt22) when the chart was first made -- confirm before reusing.
    title='<b>NYTimes.com</b> - 2022 (52,304 URLs)',
    domain='nytimes.com')

# remove the side/bottom margins and leave room at the top for the title
fig.layout.margin.l = 0
fig.layout.margin.r = 0
fig.layout.margin.b = 0
fig.layout.margin.t = 100
fig
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sitemap URL sample