Skip to content

Instantly share code, notes, and snippets.

@eliasdabbas
Created July 9, 2023 21:52
Show Gist options
  • Save eliasdabbas/a28d2012899e17ef23adaa0cd40699d8 to your computer and use it in GitHub Desktop.
Save eliasdabbas/a28d2012899e17ef23adaa0cd40699d8 to your computer and use it in GitHub Desktop.
Get redirect chains from an advertools crawl dataset
def redirect_chains(crawldf):
    """Create a tidy DataFrame with one row per URL in every redirect chain.

    Rows of ``crawldf`` whose ``redirect_urls`` or ``redirect_reasons`` value
    is missing are ignored. For every remaining row the chain consists of the
    ``@@``-delimited entries of ``redirect_urls`` followed by the row's own
    ``url``; the matching statuses are the ``@@``-delimited entries of
    ``redirect_reasons`` followed by the row's own ``status``.

    Parameters
    ----------
    crawldf : pandas.DataFrame
        Crawl dataset with the columns ``url``, ``status``, ``redirect_urls``,
        ``redirect_reasons`` and ``redirect_times``.

    Returns
    -------
    pandas.DataFrame
        One row per URL of each chain, indexed by the label of the
        originating ``crawldf`` row, with the columns:

        url: All the URLs in the redirect chain.
        status: The status code of each URL. Numeric values (``'301'``,
            ``301.0``) are returned as ``int`` so that they compare equal to
            the final status; non-numeric reasons are kept unchanged.
        order: 1, 2, 3... up to the number of URLs in the redirect chain.
        type: "requested", "intermediate", or "crawled".
        redirect_times: The number of redirects in the chain (URLs in the
            chain minus one), taken from ``crawldf`` and cast to ``int``.

        An empty DataFrame with these columns is returned when no row of
        ``crawldf`` has redirect information.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``crawldf``.
    ValueError
        If a row lists a different number of redirect URLs and redirect
        reasons, or if its ``redirect_times`` value is missing.
    """
    # Local import: no module-level pandas import is visible in this file.
    import pandas as pd

    out_columns = ['url', 'status', 'order', 'type', 'redirect_times']
    needed = ['url', 'status', 'redirect_urls', 'redirect_reasons',
              'redirect_times']
    redirects = crawldf[needed].dropna(
        subset=['redirect_urls', 'redirect_reasons'])
    if redirects.empty:
        return pd.DataFrame(columns=out_columns)

    # Build one list per chain and per output column. New lists are created
    # for every row, so nothing stored in ``crawldf`` is mutated.
    chains = {column: [] for column in out_columns}
    rows = zip(redirects['url'], redirects['status'],
               redirects['redirect_urls'], redirects['redirect_reasons'],
               redirects['redirect_times'])
    for final_url, final_status, hop_urls, hop_reasons, times in rows:
        urls = _split_multi_value(hop_urls) + [final_url]
        statuses = [_as_status_code(value) for value
                    in _split_multi_value(hop_reasons) + [final_status]]
        if len(urls) != len(statuses):
            raise ValueError(
                'redirect_urls and redirect_reasons differ in length '
                'for crawled URL {!r}'.format(final_url))
        size = len(urls)  # always >= 2: at least one hop plus the final URL
        chains['url'].append(urls)
        chains['status'].append(statuses)
        chains['order'].append(list(range(1, size + 1)))
        chains['type'].append(
            ['requested'] + ['intermediate'] * (size - 2) + ['crawled'])
        # Carry redirect_times through the explode instead of merging it back
        # on the index, which would multiply rows for a non-unique index.
        chains['redirect_times'].append([times] * size)

    final_df = (pd.DataFrame(chains, index=redirects.index)
                .apply(pd.Series.explode))
    final_df['redirect_times'] = final_df['redirect_times'].astype(int)
    return final_df


def _split_multi_value(cell):
    """Return the '@@'-delimited parts of a string cell; wrap other values."""
    if isinstance(cell, str):
        return cell.split('@@')
    return [cell]


def _as_status_code(value):
    """Return ``value`` as an int when it is a whole number, else unchanged."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return value
    return int(number) if number.is_integer() else value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment