Last active
September 10, 2022 13:17
-
-
Save eliasdabbas/7e90304747a385f75fbea8908718ae69 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawl a website with advertools and summarise which external domains it
# links to most often (unique link URLs per domain), rendered as a styled
# table when run as the last expression of a notebook cell.
import advertools as adv
import pandas as pd

pd.options.display.max_columns = None

homepage = 'https://example.com/'  # <--- change this
domain = 'example.com'  # <--- and this
top_n = 40  # maximum number of external domains shown in the summary

# Crawl the site, following links, and write one JSON line per page.
# NOTE(review): if 'output_file.jl' already exists from an earlier run the new
# rows may be appended to it -- confirm, or delete the file before re-crawling.
adv.crawl(homepage, 'output_file.jl', follow_links=True,
          custom_settings={'LOG_FILE': 'output_file.log'})
crawldf = pd.read_json('output_file.jl', lines=True)

# The link columns hold '@@'-delimited strings (one entry per link on the
# page). Split and explode them so every row is a single (page, link) pair.
# The three exploded columns share the page's original index label, which is
# what lets the `assign` calls line up with the exploded frame.
links_df = (
    pd.DataFrame({
        'url': crawldf['url'],
        'links_url': crawldf['links_url'].str.split('@@'),
    })
    .explode('links_url')
    .assign(links_text=crawldf['links_text'].str.split('@@').explode())
    .assign(links_nofollow=crawldf['links_nofollow'].str.split('@@').explode())
    # regex=False: match the domain literally ('.' must not act as a wildcard).
    # na=False: pages without links give a plain False instead of NaN, so the
    # column is boolean and can be negated directly below.
    .assign(internal=lambda df: df['links_url'].str.contains(
        domain, regex=False, na=False))
)

# Keep each external URL once; rows for pages that had no links at all carry
# a missing URL and are dropped rather than being counted as an empty link.
external_links = (
    links_df[~links_df['internal']]
    .dropna(subset=['links_url'])
    .drop_duplicates('links_url')
)
external_urldf = adv.url_to_df(external_links['links_url'])

# Top external domains by number of unique links, with a 1-based rank index.
(external_urldf['netloc']
 .value_counts()
 .head(top_n)
 # Name the columns explicitly instead of renaming 'index'/'netloc', because
 # the names produced by value_counts().reset_index() differ between pandas
 # versions.
 .rename_axis('domain')
 .reset_index(name='unique links per domain')
 # Rank by the actual number of rows: there may be fewer than `top_n` domains.
 .assign(_=lambda df: range(1, len(df) + 1))
 .set_index('_')
 .style.background_gradient(cmap='cividis')
 .set_caption(f'<h4>External links on <b>{domain}</b></h4>'))
Author
eliasdabbas
commented
Aug 31, 2022



Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment