Created
July 14, 2023 08:38
-
-
Save eliasdabbas/ba509407b52260c5141d575486795e3c to your computer and use it in GitHub Desktop.
Organize links in an advertools crawl DataFrame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def link_summary(crawldf, internal_url_regex=None):
    """Get a DataFrame summary of links from a crawl DataFrame.

    Every link found on a crawled page becomes its own row, next to the URL
    of the page it was found on.

    Parameters
    ----------
    crawldf : pandas.DataFrame
        A DataFrame of a website crawled with advertools. It needs a ``url``
        column plus the ``links_*`` columns (``links_url``, ``links_text``,
        ``links_nofollow``), in which the values for one page are joined
        with ``"@@"``.
    internal_url_regex : str, optional
        A regular expression for identifying if a link is internal or not.
        For example if your website is example.com, this would be
        "example.com". When omitted, no ``internal`` column is produced.

    Returns
    -------
    link_df : pandas.DataFrame
        One row per (page, link) with the columns ``url``, ``link``,
        ``text`` and ``nofollow``, plus ``internal`` when
        ``internal_url_regex`` is given. Pages whose link values are missing
        keep a single row with missing ``link``/``text`` and
        ``nofollow == False``.
    """
    # Split each "links_*" column on the "@@" separator and explode it so
    # every link gets its own row; exploded rows keep the index label of the
    # page they came from.
    exploded_links = crawldf.filter(regex='^links_').apply(
        lambda s: s.str.split('@@').explode())

    # Joining on the index repeats each page URL once per link on that page.
    link_df = pd.merge(
        crawldf[['url']], exploded_links,
        left_index=True, right_index=True)

    # After the split the values are the strings 'True'/'False' or missing.
    # Comparing with 'True' gives a real boolean column (missing -> False)
    # without relying on ``replace`` silently downcasting object columns.
    link_df['links_nofollow'] = link_df['links_nofollow'].eq('True')

    if internal_url_regex is not None:
        # Missing links are treated as empty strings so the result is a
        # plain boolean rather than NaN.
        link_df['internal'] = (
            link_df['links_url']
            .fillna('')
            .str.contains(internal_url_regex, regex=True))

    # Rename by name rather than by position: the positional assignment of
    # five names failed with a length mismatch whenever the optional
    # "internal" column had not been added (i.e. on the default call).
    return link_df.rename(columns={
        'links_url': 'link',
        'links_text': 'text',
        'links_nofollow': 'nofollow',
    })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment