import advertools as adv

adv.crawl(
    # start crawling from this URL (a list of URLs also works):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # Should it follow links?
    follow_links=True,
    # But don't follow all links, only links that match this regex:
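    # (assumed continuation of the truncated call; the actual pattern isn't shown
    # in the original, so this regex is illustrative only)
    include_url_regex='/wiki/',
)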
def jl_to_parquet(jl_filepath, parquet_filepath):
    """Convert a jsonlines crawl file to the parquet format.

    Parameters
    ----------
    jl_filepath : str
        The path of an existing .jl file.
    parquet_filepath : str
        The path where you want the new file to be saved (ending with .parquet).
    """
import adviz
import plotly.express as px
from plotly.subplots import make_subplots


def category_to_color(categories, colorscale='D3'):
    """Map each category to a consistent color from a qualitative colorscale."""
    # look up the named colorscale, e.g. px.colors.qualitative.D3
    colors = getattr(px.colors.qualitative, colorscale)
    cat_dict = dict(enumerate(set(categories)))
    cat_dict = {v: colors[k] for k, v in cat_dict.items()}
    return [cat_dict[cat] for cat in categories]
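# Illustrative usage (not from the original snippet): every unique category gets
# one color from the scale, and duplicates get the same color.
category_to_color(['SEO', 'PPC', 'SEO', 'Social'])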
def link_summary(crawldf, internal_url_regex=None):
    """Get a DataFrame summary of the links from a crawl DataFrame.

    Parameters
    ----------
    crawldf : DataFrame
        A DataFrame of a website crawled with advertools.
    internal_url_regex : str
        A regular expression for identifying whether a link is internal or not.
        For example, if your website is example.com, this would be "example.com".
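    """
    # A minimal sketch of the body (assumed; the original snippet is truncated).
    # advertools stores multiple link values per page in one cell, joined by '@@',
    # so split and explode them into one row per link.
    link_cols = ['links_url', 'links_text', 'links_nofollow']
    link_df = crawldf[['url'] + link_cols].copy()
    for col in link_cols:
        link_df[col] = link_df[col].str.split('@@')
    link_df = link_df.explode(link_cols)
    if internal_url_regex is not None:
        # flag each link as internal or external based on the supplied regex
        link_df['internal'] = link_df['links_url'].str.contains(
            internal_url_regex, regex=True)
    return link_df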
def redirect_chains(crawldf):
    """Create a tidy DataFrame for redirects with the columns:

    url: All the URLs in the redirect chain.
    status: The status code of each URL.
    type: "requested", "intermediate", or "crawled".
    order: 1, 2, 3... up to the number of URLs in the redirect chain.
    redirect_times: The number of redirects in the chain (URLs in the chain minus one).
    """
    redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
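                   # keep only pages that were redirected (assumed continuation;
                   # the original snippet is truncated at this point)
                   .dropna(subset=['redirect_urls', 'redirect_reasons']))
    # A minimal sketch of the rest: advertools joins chain values with '@@', and
    # redirect_reasons is assumed to hold the status code of each redirect step.
    import pandas as pd  # assumed to be imported at the top of the original script
    redirect_df['redirect_urls'] = redirect_df['redirect_urls'].str.split('@@')
    redirect_df['redirect_reasons'] = redirect_df['redirect_reasons'].str.split('@@')
    rows = []
    for _, row in redirect_df.iterrows():
        chain = row['redirect_urls'] + [row['url']]
        statuses = row['redirect_reasons'] + [row['status']]
        types = ['requested'] + ['intermediate'] * (len(chain) - 2) + ['crawled']
        for order, (url, status, type_) in enumerate(zip(chain, statuses, types), 1):
            rows.append({'url': url, 'status': status, 'type': type_,
                         'order': order, 'redirect_times': len(chain) - 1})
    return pd.DataFrame(rows)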
import advertools as adv
import adviz

# get the sitemap URLs listed in the robots.txt file (without fetching their contents)
nyt = adv.sitemap_to_df('https://nytimes.com/robots.txt', recursive=False)
# get the URLs listed in the /sitemap.xml.gz sitemap index (without fetching their contents)
nyt_sitemap_index = adv.sitemap_to_df('https://www.nytimes.com/sitemaps/new/sitemap.xml.gz', recursive=False)
nyt_2022 = []
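# Assumed continuation (the original loop is truncated): fetch only the 2022
# sitemaps from the index and combine them into one DataFrame. The '2022' filter
# is illustrative; adjust it to the index's actual URL pattern.
import pandas as pd

for sitemap_url in nyt_sitemap_index['loc']:
    if '2022' in sitemap_url:
        nyt_2022.append(adv.sitemap_to_df(sitemap_url))
nyt_2022_df = pd.concat(nyt_2022, ignore_index=True)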
from http.client import responses
import pandas as pd
import plotly.express as px
import dash_bootstrap_components as dbc
from dash_bootstrap_templates import load_figure_template

# Bootstrap theme names are the uppercase attributes of dbc.themes
themes = [theme for theme in dir(dbc.themes) if theme[0].isupper()]
# register a Plotly figure template for each of those themes
load_figure_template(themes=themes)


def status_code_chart(
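        crawldf, theme='bootstrap'):
    # A minimal sketch of one possible implementation (assumed; the original
    # signature and body are truncated): count the crawl's status codes, label
    # them with their HTTP reason phrases, and plot them as a themed bar chart.
    status_counts = crawldf['status'].value_counts().reset_index()
    status_counts.columns = ['status', 'count']
    status_counts['reason'] = [
        f'{code} {responses.get(code, "Unknown")}' for code in status_counts['status']
    ]
    fig = px.bar(
        status_counts,
        x='reason',
        y='count',
        title='Crawl status codes',
        template=theme,
    )
    return fig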
import advertools as adv
import pandas as pd

pd.options.display.max_columns = None

headers_components = {
    'User-agent': [
        # Googlebot:
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        # iPhone 13:
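        # (assumed value; the original is truncated. A typical iOS Safari user agent
        # is shown for illustration; iPhone UA strings don't include the model name)
        'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1',
    ],
}

# Illustrative usage (assumed; not part of the original snippet): run the same
# crawl once per user agent so the responses can be compared. The URL and output
# paths are placeholders.
for i, user_agent in enumerate(headers_components['User-agent']):
    adv.crawl(
        url_list='https://example.com',
        output_file=f'/home/user_name/ua_test_{i}.jl',
        custom_settings={'USER_AGENT': user_agent},
    )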
import advertools as adv
import pandas as pd

key = 'YOUR_GOOGLE_KEY'

brands = [
    'nike',
    'adidas',
    'puma',
    'asics',
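    # (the original list is truncated and may contain more brands)
]

# Assumed continuation (not shown in the original): query Google's Custom Search
# API for each brand with advertools' serp_goog. 'YOUR_CX_ID' is a placeholder for
# the programmable search engine ID that serp_goog requires alongside the key.
cx = 'YOUR_CX_ID'
serp_df = adv.serp_goog(q=brands, cx=cx, key=key, gl='us')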