Hamlet Batista (hamletbatista) / GitHub gists
// Auto-generated CSS selector copied via Chrome DevTools "Copy selector";
// run in the console to grab the target element so its contents can be inspected
document.querySelector('#yDmH0d > c-wiz:nth-child(21) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div.LgQiCc.vOSR6b.RVNZdd.qtMyGd > content > div.khfFee > div > div.CC5fre')
import asyncio
import pandas as pd

# Example list of URLs to inspect
site_pages = ["https://www.ranksense.com/", "https://www.ranksense.com/how-it-works/",
              "https://www.ranksense.com/pricing/", "https://www.ranksense.com/blog/",
              "https://www.ranksense.com/products/organic-search-ads/feed/",
              "https://www.ranksense.com/additional-ways-to-use-chrome-developer-tools-for-seo/",
              "https://www.ranksense.com/empowering-a-new-generation-of-seos-with-python/"]

# inspect_urls() is an async coroutine defined elsewhere; a sketch follows below
data = asyncio.get_event_loop().run_until_complete(inspect_urls(site_pages))
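The snippet above calls inspect_urls without defining it. As a hedged sketch of what such a coroutine could look like (the aiohttp dependency and the returned fields are assumptions, not the original implementation):

import asyncio
import aiohttp

async def inspect_urls(urls):
    """Hypothetical stand-in: fetch each URL concurrently and record
    its HTTP status and final address after redirects."""
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with session.get(url) as response:
                return {"url": url,
                        "status": response.status,
                        "final_url": str(response.url)}
        return await asyncio.gather(*[fetch(u) for u in urls])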
import sklearn as sk
from bs4 import BeautifulSoup

# Load the per-image and per-form feature extracts produced earlier
img_counts = pd.read_csv("/content/gdrive/My Drive/img_sizes.csv", usecols=["url", "img_src", "filesize", "width", "height"])
form_counts = pd.read_csv("/content/gdrive/My Drive/form_counts.csv", usecols=["url", "form_count", "input_count"])

# Preview example data without the URL columns
form_counts.head().drop("url", axis=1)
img_counts.drop(["url", "img_src"], axis=1).head()
def img_size_group(size):
    """Map an image file size in bytes to a 1 KB-wide bin label."""
    max_size = 50000
    # Image size bins: (0, 1000), (1000, 2000), ..., up to max_size
    img_size_groups = list(zip(range(0, max_size, 1000),
                               range(1000, max_size + 1000, 1000)))
    for low, high in img_size_groups:
        if low <= size < high:
            return f"{low}-{high}"
    # Anything at or above max_size falls into a single overflow bin
    return f"{max_size}+"
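The merge below expects a one-hot encoded frame named onehot_img that the excerpt never builds. A plausible bridge, assuming the bins from img_size_group are one-hot encoded and aggregated per URL (column names here are assumptions):

# Hypothetical construction of `onehot_img` used in the merge below
img_counts["size_group"] = img_counts["filesize"].apply(img_size_group)
onehot_img = (pd.get_dummies(img_counts[["url", "size_group"]],
                             columns=["size_group"])
                .groupby("url").sum().reset_index())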
# Combine form features with the one-hot image features
ml_data = form_counts.merge(onehot_img, on="url")

# Label pages by URL pattern; anything unmatched stays "N/A"
ml_data.loc[:, 'group'] = "N/A"
ml_data.loc[ml_data['url'].str.contains(r".*/products/.*|.*/product/.*"), "group"] = "Products"
ml_data.loc[ml_data['url'].str.contains(r"/collections(?!.*/products.*)(?!.*/product.*)"), "group"] = "Category"
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(ml_data.drop(["group", "url"], axis=1), ml_data['group'], test_size=0.2, random_state=42)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

names = [
    "Naive Bayes",
    "Linear SVM",
    "Logistic Regression",
    "Random Forest",
    "Multilayer Perceptron"
]
# Only MultinomialNB appears in the excerpt; the others are assumed from `names`
classifiers = [MultinomialNB(), SVC(kernel="linear"), LogisticRegression(),
               RandomForestClassifier(), MLPClassifier()]
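The results DataFrame further down is built from a rows variable that never appears in the excerpt. A minimal sketch of the missing evaluation loop, assuming each model is scored on the held-out split (the tuple shape matches the DataFrame's column names):

# Hypothetical evaluation loop producing the `rows` consumed below
rows = []
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    rows.append((name, clf.score(X_test, y_test), str(clf.get_params())))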
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Plotting tail follows the standard scikit-learn docs example this gist mirrors
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
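For context, a hedged usage example, assuming a fitted classifier clf from the loop above and the two page groups labeled earlier:

from sklearn.metrics import confusion_matrix

# `clf` is assumed to be one of the fitted classifiers from the loop above
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=["Products", "Category"])
plot_confusion_matrix(cm, ["Products", "Category"], normalize=True)
plt.show()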
import plotly.graph_objs as go

# `rows` holds (algorithm, score, params) tuples from the evaluation loop
results = pd.DataFrame(rows, columns=["algorithm", "score", "params"])
# Keep the best score per algorithm
results = results.groupby("algorithm").max().reset_index()

bar = go.Bar(
    y=results['score'].tolist(),
    x=results['algorithm'].tolist()
)
plot_data = [bar]
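To actually render the chart, something like the following would work (go.Figure is standard plotly; the original gist may have used the older iplot helper instead):

# Render the per-algorithm best scores as a bar chart
fig = go.Figure(data=plot_data)
fig.update_layout(title="Best score per algorithm")
fig.show()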
hamletbatista / get_seo_branded_data.py (created April 20, 2019)
Get branded SEO data using the Semrush API
import requests
from urllib.parse import urlencode, urlparse, urlunparse, quote
import pandas as pd

def get_seo_branded_data(brand, domain, database="us", export_columns="Ph,Po,Nq,Ur,Tg,Td,Ts", display_limit=10000, display_filter="+|Ph|Co|{brand}"):
    # `key` is the Semrush API key, set elsewhere in the notebook
    global key
    # Everything after url_params is reconstructed; the original gist is cut off here
    url_params = {"type": "domain_organic", "key": key, "domain": domain,
                  "database": database, "export_columns": export_columns,
                  "display_limit": display_limit,
                  "display_filter": display_filter.format(brand=brand)}
    api_url = "https://api.semrush.com/?" + urlencode(url_params)
    # Semrush responds with semicolon-separated values; the first line is the header
    rows = [line.split(";") for line in requests.get(api_url).text.splitlines()]
    return pd.DataFrame(rows[1:], columns=rows[0])
macys="macys.com"
brand="Tommy Hilfiger"
macys_df = get_seo_branded_data(brand, macys, export_columns="Ph,Po,Tg") # only keyword, position and traffic
#we explicitly convert numbers to integers to be able to perform arithmetic operations later
convert_dict = {'Keyword': str, 'Position': int, 'Traffic': int}
macys_df = macys_df.astype(convert_dict)
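With the columns cast to numeric types, quick branded-traffic roll-ups become possible; a small illustration (the aggregations are examples, not from the gist):

# Example arithmetic the integer conversion enables: total branded traffic
# and the share of branded keywords ranking in the top 10 positions
total_branded_traffic = macys_df['Traffic'].sum()
top10_share = (macys_df['Position'] <= 10).mean()
print(total_branded_traffic, round(top10_share * 100, 1))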