Created
July 9, 2023 21:52
-
-
Save eliasdabbas/a28d2012899e17ef23adaa0cd40699d8 to your computer and use it in GitHub Desktop.
Get redirect chains from an advertools crawl dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def redirect_chains(crawldf):
    """Create a tidy DataFrame of redirect chains, one row per URL in a chain.

    Rows of ``crawldf`` with a missing ``redirect_urls`` or
    ``redirect_reasons`` value are ignored. For every remaining row the two
    ``@@``-delimited columns are split into lists, and the row's own ``url``
    and ``status`` are appended as the last element of the chain.

    Expects ``pandas`` to be available as ``pd`` in the enclosing scope.

    Parameters
    ----------
    crawldf : pandas.DataFrame
        Must contain the columns ``url``, ``status``, ``redirect_urls``,
        ``redirect_reasons`` and ``redirect_times``.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the columns:

        url: All the URLs in the redirect chain.
        status: The split ``redirect_reasons`` values (strings) followed by
            the ``status`` value of the originating ``crawldf`` row.
        type: "requested" (first), "intermediate", or "crawled" (last).
        order: 1, 2, 3... up to the number of URLs in the redirect chain.
        redirect_times: The ``redirect_times`` value of the originating
            ``crawldf`` row, cast to ``int``.

        If no row has redirect data, an empty DataFrame with these columns
        is returned.
    """
    out_columns = ['url', 'status', 'order', 'type', 'redirect_times']
    redirected = (
        crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
        .dropna(subset=['redirect_urls', 'redirect_reasons'])
    )
    if redirected.empty:
        # With nothing left, the redirect columns may not be string-typed
        # (e.g. all-NaN floats) and the ``.str`` accessor below would raise.
        return pd.DataFrame(columns=out_columns).astype({'redirect_times': int})

    hop_urls = redirected['redirect_urls'].str.split('@@')
    hop_reasons = redirected['redirect_reasons'].str.split('@@')

    # Build new lists instead of appending to the ones stored in the frame.
    url_chains = [
        hops + [final_url]
        for hops, final_url in zip(hop_urls, redirected['url'])
    ]
    status_chains = [
        reasons + [final_status]
        for reasons, final_status in zip(hop_reasons, redirected['status'])
    ]
    orders = [list(range(1, len(chain) + 1)) for chain in status_chains]
    # ``order`` always runs 1..len(order), so the first position is the
    # requested URL and the last one is the URL that was finally crawled.
    types = [
        ['requested' if position == 1
         else 'crawled' if position == len(order)
         else 'intermediate'
         for position in order]
        for order in orders
    ]

    chains = pd.DataFrame(
        {'url': url_chains, 'status': status_chains,
         'order': orders, 'type': types},
        index=redirected.index,
    )
    # Explode every list column in parallel: one output row per chain element.
    exploded = chains.apply(pd.Series.explode)
    # Bring in ``redirect_times`` from the originating crawl row via the index.
    final_df = pd.merge(exploded, crawldf[['redirect_times']],
                        left_index=True, right_index=True)
    final_df['redirect_times'] = final_df['redirect_times'].astype(int)
    return final_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment