This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# !pip install --upgrade transformers plotly pandas | |
import plotly.graph_objects as go | |
import pandas as pd | |
pd.options.display.max_columns = None | |
from transformers import pipeline | |
unmasker = pipeline('fill-mask', model='bert-base-uncased') | |
results = [] | |
cars = ['mercedes', 'audi', 'bmw', 'volkswagen', 'ford', 'toyota', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import networkx as nx | |
import pandas as pd | |
def score_links(links_file, domain): | |
"""Score a network on links based on their importance and centrality. | |
links_file: Path to the file having the links (needs a "Source" and | |
"Destination" columns) e.g. ScreamingFrog's outlinks file. | |
domain: Filter all links, making sure they all point to the domain you want. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ipaddress | |
import requests | |
import pandas as pd | |
def bot_ip_addresses(): | |
bots_urls = { | |
'google': 'https://developers.google.com/search/apis/ipranges/googlebot.json', | |
'bing': 'https://www.bing.com/toolbox/bingbot.json' | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlsplit, parse_qs | |
import re | |
def crawl_or_not(url, | |
exclude_url_params=None, | |
include_url_params=None, | |
include_url_pattern=None, | |
exclude_url_pattern=None): | |
"""Check if ``url`` will be crawled or not given the supplied conditions. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
user_agents = [ | |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', | |
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', | |
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', | |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', | |
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0', | |
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', | |
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', | |
'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', | |
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? xml version = "1.0" encoding = "UTF-8"?> | |
<urlset xmlns = "http://www.sitemaps.org/schemas/sitemap/0.9" | |
xmlns: xhtml = "http://www.w3.org/1999/xhtml"> | |
<url> | |
<loc> http://www.example.com/ </loc> | |
<lastmod> 2005-01-01 </lastmod> |
NewerOlder