Skip to content

Instantly share code, notes, and snippets.

View eliasdabbas's full-sized avatar
💭
https://bit.ly/3ZLmvhu

Elias Dabbas eliasdabbas

💭
https://bit.ly/3ZLmvhu
View GitHub Profile
@eliasdabbas
eliasdabbas / incremental_crawling.py
Last active October 20, 2023 12:42
Incremental crawling with advertools. Crawl a set number of pages every time without re-crawling the same pages
import advertools as adv
adv.crawl(
# start crawling from this URL(s):
url_list='https://en.wikipedia.org/wiki/Main_Page',
# save the crawl output to this file:
output_file='/home/user_name/wikipedia_en_crawl.jl',
# Should it follow links?
follow_links=True,
# But don't follow all links, only links that match this regex:
@eliasdabbas
eliasdabbas / meta_tags.py
Created August 12, 2023 11:31
Get all meta tags of a selected URL (every tag under the <head> section of the page)
import requests
from bs4 import BeautifulSoup
import pandas as pd
def meta_tags(url, get_text=['title']):
"""Get all tags under the <head> of `url` with all attributes and values.
This is mainly for exploratory purposes, to discover what is available,
and if there are errors. If you know which tags/attributes you want beforehand
you can easily get them with custom extraction (CSS/XPath selectors).
@eliasdabbas
eliasdabbas / jl_to_parquet.py
Created July 16, 2023 11:01
Convert a jsonlines file to a compressed parquet file (if JSON objects have different types, e.g. list and scalar in the same column, it converts them to strings)
def jl_to_parquet(jl_filepath, parquet_filepath):
    """Convert a jsonlines crawl file to the parquet format.

    Parameters
    ----------
    jl_filepath : str
        The path of an existing .jl file.
    parquet_filepath : str
        The path where you want the new file to be saved (ending with .parquet).
    """
@eliasdabbas
eliasdabbas / tableviz.py
Created July 15, 2023 10:01
Visualizing tables with Plotly
import adviz
import plotly.express as px
from plotly.subplots import make_subplots
def category_to_color(categories, colorscale='D3'):
    """Map each value in `categories` to a color from a qualitative colorscale.

    Equal categories get the same color, so the result can be used directly
    as a per-row color list for a table or chart.

    Parameters
    ----------
    categories : iterable
        The categorical values to color (duplicates allowed).
    colorscale : str
        Name of a qualitative colorscale attribute of
        ``plotly.express.colors.qualitative``, e.g. 'D3', 'Plotly', 'Set1'.

    Returns
    -------
    list
        One color per element of `categories`.

    Raises
    ------
    AttributeError
        If `colorscale` is not a qualitative colorscale name.
    """
    # Look up the palette by name with getattr instead of eval() —
    # eval on an interpolated string is unsafe and unnecessary here.
    palette = getattr(px.colors.qualitative, colorscale)
    # NOTE(review): iterating a set makes the category -> color pairing
    # non-deterministic across runs; behavior preserved from the original.
    color_map = {cat: palette[i] for i, cat in enumerate(set(categories))}
    return [color_map[cat] for cat in categories]
@eliasdabbas
eliasdabbas / crawl_link_summary.py
Created July 14, 2023 08:38
Organize links in an advertools crawl DataFrame
@eliasdabbas
eliasdabbas / redirect_chains.py
Created July 9, 2023 21:52
Get redirect chains from an advertools crawl dataset
def redirect_chains(crawldf):
    """Create a tidy DataFrame for redirects with the columns:

    url: All the URLs in the redirect chain.
    status: The status code of each URL.
    type: "requested", "intermediate", or "crawled".
    order: 1, 2, 3... up to the number of urls in the redirect chain.
    redirect_times: The number of redirects in the chain (URLs in the chain minus one).
    """
    redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
import advertools as adv
import adviz
# get URLs of the sitemap index
nyt = adv.sitemap_to_df('https://nytimes.com/robots.txt', recursive=False)
# get URLs of the /sitemap.xml.gz sitemap index
nyt_sitemap_index = adv.sitemap_to_df('https://www.nytimes.com/sitemaps/new/sitemap.xml.gz', recursive=False)
nyt_2022 = []
@eliasdabbas
eliasdabbas / status_code_figure.py
Created October 13, 2022 23:30
Visualize a list of HTTP status codes as a treemap of two levels.
from http.client import responses
import pandas as pd
import plotly.express as px
import dash_bootstrap_components as dbc
from dash_bootstrap_templates import load_figure_template
themes = [theme for theme in dir(dbc.themes) if theme[0].isupper()]
load_figure_template(themes=themes)
def status_code_chart(
@eliasdabbas
eliasdabbas / crawl_multiple_header_combinations.py
Created October 13, 2022 09:59
Crawl a bunch of URLs using various combinations of request headers
import advertools as adv
import pandas as pd
pd.options.display.max_columns = None
headers_components = {
'User-agent': [
# Googlebot:
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
# iPhone 13:
import advertools as adv
import pandas as pd
key = 'YOUR_GOOGLE_KEY'
brands = [
'nike',
'adidas',
'puma',
'asics',