Elias Dabbas (eliasdabbas)
https://adver.tools/
@eliasdabbas
eliasdabbas / direct_labeling_plotly_express.py
Last active November 6, 2024 12:25
Direct labeling with plotly.express
import plotly.express as px

# Assumes `stocks` (a wide DataFrame: a 'date' column plus one column per
# series) and `cat_scale` (a list of colors) are defined earlier in the gist.
fig = px.line(
    stocks,
    x='date',
    template='plotly_white',
    y=stocks.columns[1:],
    color_discrete_sequence=cat_scale,
    labels={'y': 'stock price'},
    height=600,
)
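The preview shows only the figure creation. A minimal sketch of the direct-labeling step itself, assuming `stocks` is a wide DataFrame like px.data.stocks() (a 'date' column plus one column per ticker); placing annotations at the last points is one way to do it, not necessarily the gist's exact approach:

import plotly.express as px

stocks = px.data.stocks()  # stand-in dataset (assumption)
fig = px.line(stocks, x='date', y=stocks.columns[1:], template='plotly_white')

# Label each line at its last point and drop the legend:
last_row = stocks.iloc[-1]
for col in stocks.columns[1:]:
    fig.add_annotation(x=last_row['date'], y=last_row[col], text=col,
                       showarrow=False, xanchor='left')
fig.update_layout(showlegend=False)
fig.show()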
@eliasdabbas
eliasdabbas / cartogram.py
Last active November 6, 2024 12:07
Create a cartogram by assigning size to flags (or two-letter symbols) without displaying map lines
import plotly.express as px
import adviz
import numpy as np
import pandas as pd

# Pull the country population tables from Wikipedia:
population = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')
pop = population[0][['Location', 'Population']].copy()

# Build a flag (emoji) for each country:
flags = []
for country in pop['Location']:
    try:
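The preview cuts off inside the try block. A hedged sketch of what the loop likely does: build flag emojis from two-letter country codes via the Unicode regional-indicator trick. The name-to-ISO-2 lookup here is a stand-in dict, not the gist's actual mapping:

iso2 = {'China': 'CN', 'India': 'IN', 'United States': 'US'}  # stand-in lookup

def flag_emoji(alpha2):
    # 'US' -> 🇺🇸 : shift each ASCII letter into the regional-indicator block
    return ''.join(chr(ord(c) + 127397) for c in alpha2.upper())

flags = []
for country in pop['Location']:
    try:
        flags.append(flag_emoji(iso2[country]))
    except KeyError:
        flags.append('')  # no code found; leave the label empty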
@eliasdabbas
eliasdabbas / filter_non_200_status_codes.py
Created June 2, 2024 19:17
Filter non-200 status codes on a daily basis
import os
import datetime
import pandas as pd

# Today's date, used in the status-code file names (datetime.UTC needs
# Python 3.11+):
today = datetime.datetime.now(datetime.UTC).strftime('%Y_%m_%d')

url_status_time = pd.concat(
    pd.read_json(f'/path/to/status_codes/{file}',
                 lines=True)
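A hedged completion of the pd.concat call: read every day's JSON-lines file and keep the non-200 rows. The folder is the gist's placeholder path, and the `status` column assumes the adv.crawl_headers output format:

folder = '/path/to/status_codes/'  # placeholder path
url_status_time = pd.concat(
    pd.read_json(folder + file, lines=True) for file in os.listdir(folder)
)
# Keep only the problematic responses:
non_200 = url_status_time[url_status_time['status'].ne(200)]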
@eliasdabbas
eliasdabbas / daily_status_code.py
Created June 1, 2024 20:00
Setting up a daily status code checker
import datetime
import advertools as adv

today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d")

# Get all URLs listed in the sitemap:
sitemap = adv.sitemap_to_df("https://example/sitemap.xml")

# Fetch response headers for every URL (HEAD requests only):
adv.crawl_headers(
    sitemap["loc"],
@eliasdabbas
eliasdabbas / create_list_crawler_for_sheets.py
Created May 22, 2024 11:19
Create an SEO crawler in list mode working on Google Sheets
import pandas as pd

# Scrape the table of suggested XPath expressions from the advertools docs:
xpath = pd.read_html('https://advertools.readthedocs.io/en/master/advertools.code_recipes.spider_strategies.html')
df = xpath[2].iloc[:, [1, 2]]
# Wrap each expression in a Google Sheets IMPORTXML formula ("@@" joins multiple matches):
df = df.assign(expression=[f'=textjoin("@@",100,IMPORTXML(A2,"{expression}"))' for expression in df['XPath Expression']])
# `display` assumes a Jupyter session; also copy for pasting into a sheet:
display(df[['Suggested Name', 'expression']].T)
df[['Suggested Name', 'expression']].T.to_clipboard(index=False)
@eliasdabbas
eliasdabbas / url_to_html_sitemap.py
Last active April 16, 2024 10:03
Create an HTML sitemap for a list of URLs and their anchor text
# Create anchors for all letters:
import string
print('<h3>' + '&nbsp;&nbsp;&nbsp;&nbsp;'.join([f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase]) + '</h3>')

# Assuming you have a DataFrame with the columns "full_name" and "loc":
for letter in string.ascii_uppercase:
    df = players_df[players_df['full_name'].str[0].eq(letter)]
    print()
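A hedged completion of the per-letter loop (the preview ends here), keeping the assumption that players_df has "full_name" and "loc" columns; the exact markup is a sketch:

for letter in string.ascii_uppercase:
    subset = players_df[players_df['full_name'].str[0].eq(letter)]
    # Letter heading that the anchor links at the top point to:
    print(f'<h2 id="{letter}">{letter}</h2>')
    for name, loc in zip(subset['full_name'], subset['loc']):
        print(f'<p><a href="{loc}">{name}</a></p>')
    print()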
@eliasdabbas
eliasdabbas / url_to_xml_sitemap.py
Created April 15, 2024 12:15
Convert a list of URLs to an XML sitemap
import datetime
import pandas as pd

lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d')
url_list = [
    'https://nbastats.pro/player/Eric_Moreland',
    'https://nbastats.pro/player/Marc_Iavaroni',
    'https://nbastats.pro/player/Keith_Tower',
    'https://nbastats.pro/player/Hakeem_Olajuwon',
    'https://nbastats.pro/player/Mike_Price',
    'https://nbastats.pro/player/Doug_Collins',
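The preview cuts off inside url_list. A hedged sketch of the conversion step itself, using `url_list` and `lastmod` from above and the standard sitemap urlset schema:

url_entries = '\n'.join(
    f'<url><loc>{url}</loc><lastmod>{lastmod}</lastmod></url>'
    for url in url_list
)
xml_sitemap = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    f'{url_entries}\n'
    '</urlset>'
)
print(xml_sitemap)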
@eliasdabbas
eliasdabbas / running_crawls.py
Last active November 2, 2023 13:33
Get a summary of the currently running crawl jobs (using the advertools crawler)
from subprocess import run
from functools import partial

# Capture output as text for all calls:
run = partial(run, text=True, capture_output=True)

def running_crawls():
    """Get details of currently running spiders.

    Get a DataFrame showing the following details:
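The docstring is cut off here. A hedged sketch of one possible body, assuming the crawl jobs show up as Scrapy processes and that `ps` accepts the pid,etime,command output format (Linux/macOS):

import pandas as pd

def running_crawls():
    """Get details of currently running spiders as a DataFrame."""
    proc = run(['ps', 'xo', 'pid,etime,command'])
    rows = [line.split(maxsplit=2) for line in proc.stdout.splitlines()[1:]]
    df = pd.DataFrame(rows, columns=['pid', 'elapsed', 'command'])
    # advertools runs its crawls through Scrapy:
    return df[df['command'].str.contains('scrapy')]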
@eliasdabbas
eliasdabbas / news_headlines_automated.py
Created October 21, 2023 14:14
Get the main headline story from the homepages of news websites.
import advertools as adv

url_xpath_selectors = {
    'https://www.ft.com': ('main_story_headline', '//span[contains(@class, "text text--color-black text-display--scale-7 text--weight-500")]/text()'),
    'https://www.nytimes.com': ('main_story_headline', '//h3[@class="indicate-hover css-si8ren"]/text()'),
    'https://www.economist.com': ('main_story_headline', '//a[@data-analytics="top_stories:headline_1"]/text()'),
    'https://edition.cnn.com': ('main_story_headline', '//h2[@class="container__title_url-text container_lead-package__title_url-text"]/text()'),
    'https://www.nbcnews.com': ('main_story_headline', '//h2[@class="storyline__headline founders-cond fw6 important large headlineOnly"]/text()'),
    'https://www.bbc.com': ('main_story_headline', '//a[@rev="hero1|headline"]/text()'),
    'https://www.foxnews.com': ('main_story_headline', '(//header[@class="info-header"])[1]//a/text()'),
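A hedged completion: close the dict and crawl each homepage with its own selector. adv.crawl takes an xpath_selectors dict mapping an output column name to an XPath expression; the output file name is a placeholder:

for url, (name, xpath) in url_xpath_selectors.items():
    adv.crawl(
        url_list=url,
        output_file='news_headlines.jl',  # placeholder; rows are appended
        xpath_selectors={name: xpath},
    )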
@eliasdabbas
eliasdabbas / incremental_crawling.py
Last active October 20, 2023 12:42
Incremental crawling with advertools. Crawl a set number of pages every time without re-crawling the same pages
import advertools as adv

adv.crawl(
    # Start crawling from this URL (or list of URLs):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # Save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # Should it follow links?
    follow_links=True,
    # But don't follow all links, only links that match this regex:
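The preview ends at the regex comment. A hedged, self-contained completion modeled on the incremental-crawling recipe in the advertools docs: CLOSESPIDER_PAGECOUNT caps the pages per run, and JOBDIR persists crawl state so the next run skips already-crawled pages. The regex and paths are placeholders:

import advertools as adv

adv.crawl(
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    follow_links=True,
    include_url_regex='/wiki/',  # placeholder pattern
    custom_settings={
        # Stop after this many pages on each run:
        'CLOSESPIDER_PAGECOUNT': 1000,
        # Persist the crawl queue/state between runs:
        'JOBDIR': '/home/user_name/wikipedia_en_crawl_job',
    },
)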