Elias Dabbas (eliasdabbas)
https://github.com/eliasdabbas/langchain-advertools
@eliasdabbas
eliasdabbas / company_marketcap_interactive_scatter.py
Last active June 11, 2022 12:40
Interactive, emailable HTML chart of the top 500 companies. Users can select which countries to display.
import pandas as pd
import plotly.express as px
import requests

# Scrape the top 500 companies (5 pages, 100 rows each):
dflist = []
for i in range(1, 6):
    resp = requests.get(f'https://companiesmarketcap.com/page/{i}/')
    df = pd.read_html(resp.text)[0]
    dflist.append(df)

# Combine the five pages into one DataFrame:
companies = pd.concat(dflist, ignore_index=True)
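A minimal sketch of the charting step (assumed: the column names come from the scraped table, numeric parsing of the scraped strings is omitted, and plotly's built-in legend clicks provide the country selection):

fig = px.scatter(companies,
                 x='Price',        # assumed column name
                 y='Market Cap',   # assumed column name
                 color='Country',  # clicking legend items shows/hides countries
                 hover_name='Name')
fig.write_html('companies_marketcap.html')  # standalone, emailable HTML file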
import plotly.express as px

def treemap(traffic_df, metric='Users', path=['Medium', 'Source']):
    """Make an interactive treemap for two data dimensions/levels.

    Parameters:
    -----------
    traffic_df : A DataFrame containing two dimensions, and one or more metrics
    """
    # Assumed completion (gist truncated here): nest the `path` levels, sized by `metric`.
    return px.treemap(traffic_df, path=path, values=metric)
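Assumed usage with the default parameters:

fig = treemap(traffic_df, metric='Users', path=['Medium', 'Source'])
fig.show()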
@eliasdabbas
eliasdabbas / dress_serp_heatmap.py
Last active April 29, 2023 19:17
Dress SERP heat-map for the query templates "dress type styles" and "shop dress type": 40 dress types, 4 countries (US, UK, CA, AU).
import advertools as adv
import pandas as pd
import plotly
import plotly.graph_objects as go

pd.options.display.max_columns = None

# Google Programmable Search Engine credentials (placeholders):
cx = 'YOUR_CSE_ID'
key = 'YOUR_GOOGLE_DEV_KEY'
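A hedged sketch of the querying step (the dress types and exact query phrasing are assumptions; adv.serp_goog is advertools' wrapper for the Google Custom Search API, and list-valued parameters are expanded to all combinations):

dress_types = ['maxi', 'wrap', 'bodycon']  # illustrative subset of the 40 types
queries = ([f'{d} dress styles' for d in dress_types] +
           [f'shop {d} dress' for d in dress_types])
serp_df = adv.serp_goog(q=queries, cx=cx, key=key, gl=['us', 'uk', 'ca', 'au'])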
import advertools as adv
import pandas as pd

pd.options.display.max_columns = None

# Copied from https://en.wikipedia.org/wiki/List_of_cancer_types
cancers = {
    "Chondrosarcoma": "Bone and muscle sarcoma",
    "Ewing's sarcoma": "Bone and muscle sarcoma",
@eliasdabbas
eliasdabbas / serp_heatmap.py
Last active February 2, 2024 22:58
Create a heatmap of SERPs from a table with the columns "keyword", "rank", and "domain".
import pandas as pd
import plotly.graph_objects as go

def serp_heatmap(df, num_domains=10, select_domain=None):
    # Normalize column names to the Google CSE naming used below:
    df = df.rename(columns={'domain': 'displayLink',
                            'searchTerms': 'keyword'})
    # Keep only the most frequent domains, dropping empty values:
    top_domains = df['displayLink'].value_counts()[:num_domains].index.tolist()
    top_df = df[df['displayLink'].isin(top_domains) & df['displayLink'].ne('')]
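A minimal sketch of the plotting step (the gist's full implementation is truncated above; this pivots rank into a keyword-by-domain grid and draws it with go.Heatmap):

pivot = top_df.pivot_table(index='keyword', columns='displayLink', values='rank')
fig = go.Figure(go.Heatmap(z=pivot.values, x=pivot.columns, y=pivot.index,
                           reversescale=True))  # so rank 1 (best) stands out
fig.show()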
@eliasdabbas
eliasdabbas / crawl_multiple_sites.py
Last active April 27, 2022 08:56
Crawl multiple websites with one for loop, while saving the output, logs, and job status separately for each website. Resume crawling any time simply by re-running the same code.
from urllib.parse import urlsplit

import advertools as adv

sites = [
    'https://www.who.int',
    'https://www.nytimes.com',
    'https://www.washingtonpost.com',
]
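A hedged sketch of the loop the description refers to (the file-naming scheme is an assumption; LOG_FILE and JOBDIR are standard Scrapy settings that adv.crawl passes through, giving each site its own log file and resumable job state):

for site in sites:
    domain = urlsplit(site).netloc
    adv.crawl(site, output_file=f'{domain}.jl', follow_links=True,
              custom_settings={'LOG_FILE': f'{domain}.log',
                               'JOBDIR': f'{domain}_jobdir'})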
from unicodedata import lookup

def flag(cc):
    """Return the emoji flag for a two-letter country code, e.g. 'US'."""
    l1 = lookup(f'REGIONAL INDICATOR SYMBOL LETTER {cc[0]}')
    l2 = lookup(f'REGIONAL INDICATOR SYMBOL LETTER {cc[1]}')
    return l1 + l2
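For example:

flag('US')  # '🇺🇸'
flag('JP')  # '🇯🇵'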
import datetime

import advertools as adv
import pandas as pd

# Words to exclude when counting word frequencies:
stopwords = ['to', 'of', 'the', 'in', 'for', 'and', 'on', 'a', 'as', 'with',
             'from', 'over', 'is', 'at', '—', '-', 'be', '2022', '–', 'it', 'by',
             'we', 'why', 'but', 'my', 'how', 'not', 'an', 'are', 'no', 'go',
             'your', 'up', 'his']
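A hedged sketch of how such a list is typically used with advertools' word_frequency (the titles list is an assumption; rm_words removes the stop words before counting):

titles = ['Why we love data', 'How to crawl a website']  # illustrative
word_freq = adv.word_frequency(titles, rm_words=stopwords)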
@eliasdabbas
eliasdabbas / robots_sitemaps_urls_wordfreq.sh
Last active April 6, 2022 20:35
Fetch a robots.txt file, get the relevant XML sitemap, extract and split the URLs, and count the words in article titles. Watch this video for more details: https://bit.ly/3HMZC0A
# pip install advertools==0.14.0a7

# Get the robots.txt file and save it to a CSV:
advertools robots --url https://www.economist.com/robots.txt econ_robots.csv

# Find the lines that start with "sitemap" and save the URL to the variable sitemap_url:
sitemap_url=$(grep ^sitemap -i econ_robots.csv | cut -d , -f 2)

# Get the sitemap index file without downloading the sub-sitemaps (not recursive):
advertools sitemaps $sitemap_url econ_sitemap.csv --recursive 0
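The remaining steps from the description (extract and split URLs, count title words), sketched in Python since the shell version is cut off here; adv.sitemap_to_df, adv.url_to_df, and adv.word_frequency are the advertools functions involved, and the 'loc' and 'last_dir' column names follow their standard output:

import advertools as adv
import pandas as pd

sitemap_index = pd.read_csv('econ_sitemap.csv')
articles = adv.sitemap_to_df(sitemap_index['loc'][0])  # fetch one sub-sitemap
urls = adv.url_to_df(articles['loc'])                  # split URLs into components
titles = urls['last_dir'].str.replace('-', ' ')        # article slugs as rough titles
print(adv.word_frequency(titles).head(10))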