Skip to content

Instantly share code, notes, and snippets.

View eliasdabbas's full-sized avatar
💭
https://github.com/eliasdabbas/langchain-advertools

Elias Dabbas eliasdabbas

💭
https://github.com/eliasdabbas/langchain-advertools
View GitHub Profile
# !pip install --upgrade transformers plotly pandas
import plotly.graph_objects as go
import pandas as pd
pd.options.display.max_columns = None
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
results = []
cars = ['mercedes', 'audi', 'bmw', 'volkswagen', 'ford', 'toyota',
@eliasdabbas
eliasdabbas / score_links.py
Last active September 20, 2022 12:26
Score internal links using two columns of "Source" and "Destination". This calculates various link importance metrics like degree centrality, betweenness centrality, and PageRank.
@eliasdabbas
eliasdabbas / get_bot_ip_addresses.py
Last active August 11, 2025 18:28
Get the most up-to-date list of IP addresses for crawler bots, belonging to Google and Bing.
import ipaddress
import requests
import pandas as pd
def bot_ip_addresses():
bots_urls = {
'google': 'https://developers.google.com/search/apis/ipranges/googlebot.json',
'bing': 'https://www.bing.com/toolbox/bingbot.json'
}
@eliasdabbas
eliasdabbas / to_crawl_or_not_to_crawl.py
Created December 19, 2021 19:21
Check if a given URL will be crawled or not given a set of conditions.
from urllib.parse import urlsplit, parse_qs
import re
def crawl_or_not(url,
exclude_url_params=None,
include_url_params=None,
include_url_pattern=None,
exclude_url_pattern=None):
"""Check if ``url`` will be crawled or not given the supplied conditions.
@eliasdabbas
eliasdabbas / user_agent_to_data_frame.py
Created November 30, 2021 16:11
Convert a list of user agents into a DataFrame of parsed UAs
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
<? xml version = "1.0" encoding = "UTF-8"?>
<urlset xmlns = "http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns: xhtml = "http://www.w3.org/1999/xhtml">
<url>
<loc> http://www.example.com/ </loc>
<lastmod> 2005-01-01 </lastmod>