Hamlet Batista (hamletbatista) / GitHub gists
// Auto-generated CSS selector copied via Chrome DevTools "Copy selector";
// run in the console to grab the target element so its contents can be inspected
document.querySelector('#yDmH0d > c-wiz:nth-child(21) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div.LgQiCc.vOSR6b.RVNZdd.qtMyGd > content > div.khfFee > div > div.CC5fre')
import asyncio
import pandas as pd

# Example list of URLs to inspect
site_pages = ["https://www.ranksense.com/", "https://www.ranksense.com/how-it-works/",
              "https://www.ranksense.com/pricing/", "https://www.ranksense.com/blog/",
              "https://www.ranksense.com/products/organic-search-ads/feed/",
              "https://www.ranksense.com/additional-ways-to-use-chrome-developer-tools-for-seo/",
              "https://www.ranksense.com/empowering-a-new-generation-of-seos-with-python/"]

# inspect_urls() is an async coroutine defined elsewhere; a sketch follows below
data = asyncio.get_event_loop().run_until_complete(inspect_urls(site_pages))
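The snippet above calls inspect_urls without defining it. As a hedged sketch of what such a coroutine could look like (the aiohttp dependency and the returned fields are assumptions, not the original implementation):

import asyncio
import aiohttp

async def inspect_urls(urls):
    """Hypothetical stand-in: fetch each URL concurrently and record
    its HTTP status and final address after redirects."""
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with session.get(url) as response:
                return {"url": url,
                        "status": response.status,
                        "final_url": str(response.url)}
        return await asyncio.gather(*[fetch(u) for u in urls])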
import sklearn as sk
from bs4 import BeautifulSoup

# Load the per-image and per-form feature extracts produced earlier
img_counts = pd.read_csv("/content/gdrive/My Drive/img_sizes.csv", usecols=["url", "img_src", "filesize", "width", "height"])
form_counts = pd.read_csv("/content/gdrive/My Drive/form_counts.csv", usecols=["url", "form_count", "input_count"])

# Preview example data without the URL columns
form_counts.head().drop("url", axis=1)
img_counts.drop(["url", "img_src"], axis=1).head()
def img_size_group(size):
    """Map an image file size in bytes to a 1 KB-wide bin label."""
    max_size = 50000
    # Image size bins: (0, 1000), (1000, 2000), ..., up to max_size
    img_size_groups = list(zip(range(0, max_size, 1000),
                               range(1000, max_size + 1000, 1000)))
    for low, high in img_size_groups:
        if low <= size < high:
            return f"{low}-{high}"
    # Anything at or above max_size falls into a single overflow bin
    return f"{max_size}+"
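The merge below expects a one-hot encoded frame named onehot_img that the excerpt never builds. A plausible bridge, assuming the bins from img_size_group are one-hot encoded and aggregated per URL (column names here are assumptions):

# Hypothetical construction of `onehot_img` used in the merge below
img_counts["size_group"] = img_counts["filesize"].apply(img_size_group)
onehot_img = (pd.get_dummies(img_counts[["url", "size_group"]],
                             columns=["size_group"])
                .groupby("url").sum().reset_index())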
# Combine form features with the one-hot image features
ml_data = form_counts.merge(onehot_img, on="url")

# Label pages by URL pattern; anything unmatched stays "N/A"
ml_data.loc[:, 'group'] = "N/A"
ml_data.loc[ml_data['url'].str.contains(r".*/products/.*|.*/product/.*"), "group"] = "Products"
ml_data.loc[ml_data['url'].str.contains(r"/collections(?!.*/products.*)(?!.*/product.*)"), "group"] = "Category"
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(ml_data.drop(["group", "url"], axis=1), ml_data['group'], test_size=0.2, random_state=42)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

names = [
    "Naive Bayes",
    "Linear SVM",
    "Logistic Regression",
    "Random Forest",
    "Multilayer Perceptron"
]
# Only MultinomialNB appears in the excerpt; the others are assumed from `names`
classifiers = [MultinomialNB(), SVC(kernel="linear"), LogisticRegression(),
               RandomForestClassifier(), MLPClassifier()]
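The results DataFrame further down is built from a rows variable that never appears in the excerpt. A minimal sketch of the missing evaluation loop, assuming each model is scored on the held-out split (the tuple shape matches the DataFrame's column names):

# Hypothetical evaluation loop producing the `rows` consumed below
rows = []
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    rows.append((name, clf.score(X_test, y_test), str(clf.get_params())))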
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Plotting tail follows the standard scikit-learn docs example this gist mirrors
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
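For context, a hedged usage example, assuming a fitted classifier clf from the loop above and the two page groups labeled earlier:

from sklearn.metrics import confusion_matrix

# `clf` is assumed to be one of the fitted classifiers from the loop above
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=["Products", "Category"])
plot_confusion_matrix(cm, ["Products", "Category"], normalize=True)
plt.show()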
import plotly.graph_objs as go

# `rows` holds (algorithm, score, params) tuples from the evaluation loop
results = pd.DataFrame(rows, columns=["algorithm", "score", "params"])
# Keep the best score per algorithm
results = results.groupby("algorithm").max().reset_index()

bar = go.Bar(
    y=results['score'].tolist(),
    x=results['algorithm'].tolist()
)
plot_data = [bar]
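To actually render the chart, something like the following would work (go.Figure is standard plotly; the original gist may have used the older iplot helper instead):

# Render the per-algorithm best scores as a bar chart
fig = go.Figure(data=plot_data)
fig.update_layout(title="Best score per algorithm")
fig.show()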
hamletbatista / get_seo_branded_data.py (created April 20, 2019)
Get branded SEO data using the Semrush API
import requests
from urllib.parse import urlencode, urlparse, urlunparse, quote
import pandas as pd

def get_seo_branded_data(brand, domain, database="us", export_columns="Ph,Po,Nq,Ur,Tg,Td,Ts", display_limit=10000, display_filter="+|Ph|Co|{brand}"):
    # `key` is the Semrush API key, set elsewhere in the notebook
    global key
    # Everything after url_params is reconstructed; the original gist is cut off here
    url_params = {"type": "domain_organic", "key": key, "domain": domain,
                  "database": database, "export_columns": export_columns,
                  "display_limit": display_limit,
                  "display_filter": display_filter.format(brand=brand)}
    api_url = "https://api.semrush.com/?" + urlencode(url_params)
    # Semrush responds with semicolon-separated values; the first line is the header
    rows = [line.split(";") for line in requests.get(api_url).text.splitlines()]
    return pd.DataFrame(rows[1:], columns=rows[0])
macys="macys.com"
brand="Tommy Hilfiger"
macys_df = get_seo_branded_data(brand, macys, export_columns="Ph,Po,Tg") # only keyword, position and traffic
#we explicitly convert numbers to integers to be able to perform arithmetic operations later
convert_dict = {'Keyword': str, 'Position': int, 'Traffic': int}
macys_df = macys_df.astype(convert_dict)
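With the columns cast to numeric types, quick branded-traffic roll-ups become possible; a small illustration (the aggregations are examples, not from the gist):

# Example arithmetic the integer conversion enables: total branded traffic
# and the share of branded keywords ranking in the top 10 positions
total_branded_traffic = macys_df['Traffic'].sum()
top10_share = (macys_df['Position'] <= 10).mean()
print(total_branded_traffic, round(top10_share * 100, 1))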