Hamlet Batista hamletbatista

hamletbatista / rangediff

Created November 9, 2017 21:54

A custom Google Sheets function that compares two data sets, each made up of pairs of related values: one textual and one numeric

	/**
	 * Tells whether a range has at least one row and whether that first row
	 * holds exactly two cells.
	 *
	 * @param {Array<Array>} r Range to inspect, as an array of rows.
	 * @return {boolean} true when the range is non-empty and its first row has
	 *     length 2, false otherwise.
	 */
	function is_valid_range_(r) {
	  // Anything without a positive length (including an empty range) is rejected.
	  if (!(r.length > 0)) {
	    return false;
	  }
	  // Only the first row is checked for the two-cell shape.
	  var firstRow = r[0];
	  return firstRow.length === 2;
	}

	/**
	 * Converts a range (array of rows) into an array of { text, count } objects.
	 *
	 * @param {Array<Array>} r Rows to convert; the first cell of each row is
	 *     trimmed and stored as `text`, the second cell is stored unchanged as
	 *     `count`.
	 * @return {Array<Object>} One { text, count } object per row, in row order.
	 */
	function get_range_values_(r) {
	var values = []
	for(var i=0;i<r.length;i++) {
	// .trim() is invoked on the first cell, so that cell has to be a string.
	values.push({ text: r[i][0].trim(), count: r[i][1] });
	}
	return values;

hamletbatista / read_sitemap_index.py

Created February 27, 2019 21:33

Read Sitemap URLs from XML Sitemap Index

	from bs4 import BeautifulSoup
	import requests

	# Location of the XML sitemap index to download.
	sitemap_index_url = "https://www.searchenginejournal.com/sitemap_index.xml"

	# Starts empty; presumably filled with one entry per sitemap listed in the
	# index once the XML is parsed -- TODO confirm against the parsing code.
	sitemap_index = {}

	# Without a timeout, requests waits indefinitely on an unresponsive server.
	r = requests.get(sitemap_index_url, timeout=30)
	# Fail loudly on 4xx/5xx instead of treating an error page as sitemap XML.
	r.raise_for_status()
	xml = r.text

hamletbatista / read_xml_sitemaps.py

Created February 27, 2019 21:51

Read URLs from XML Sitemap

	# Starts empty; presumably filled with per-URL data from each sitemap
	# further down -- TODO confirm, the filling code is not visible here.
	sitemaps = {}

	# NOTE(review): block indentation was lost in this listing; confirm the
	# nesting of the statements below against the original script.
	# `lasmod` is unpacked from each item but not used in the visible lines.
	for (sitemap_url, lasmod) in sitemap_index.items():
	# str.find() returns -1 when the text is absent and 0 for a match at the
	# very start, so "> 0" accepts "post" only after the first character.
	if(sitemap_url.find("post") > 0):
	print(sitemap_url)

	# Always-true guard, kept as a switch for testing.
	if 1: # for testing
	# Download the sitemap and keep its body as text.
	r = requests.get(sitemap_url)
	xml = r.text

hamletbatista / load_sitemap_to_pandas.py

Last active March 7, 2019 23:05

	import pandas as pd

	# Show the installed pandas version; it should be 0.23 or later.
	print(pd.__version__)

	# Dictionary keys become the row index; the values go into a single
	# "lastmod" column.
	df = pd.DataFrame.from_dict(
	    sitemaps,
	    orient="index",
	    columns=["lastmod"],
	)

	# Preview the first ten rows.
	df.head(10)

hamletbatista / prepare_word_cloud.py

Created February 27, 2019 22:03

	from collections import Counter
	import re

	import nltk
	from nltk.corpus import stopwords

	# Fetch the NLTK "stopwords" corpus so the `stopwords` reader imported
	# above has data to load.
	nltk.download('stopwords')

	from urllib.parse import urlparse

hamletbatista / create_word_cloud.py

Created February 27, 2019 22:06

	# Set of English stop words, built once for fast membership tests.
	english_stopwords = set(stopwords.words('english'))

	# Count every path fragment (split on "-" and "/") that is non-empty, is not
	# an English stop word and is not made up only of digits.
	cnt = Counter(
	    fragment
	    for url_path in df.path
	    for fragment in re.split("[-/]", url_path)
	    if fragment
	    and fragment not in english_stopwords
	    and not fragment.isdigit()
	)

	# The 25 most frequent fragments with their counts.
	cnt.most_common(25)

hamletbatista / create_visual_word_cloud.py

Created February 27, 2019 22:09

	from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
	import matplotlib.pyplot as plt

	# Keep only the words (dropping the counts) of the 25 most common entries.
	word_cloud = [word for word, _count in cnt.most_common(25)]

	# A white background is requested explicitly; a bare WordCloud() would
	# render on the default black background.
	cloud_builder = WordCloud(max_words=25, background_color="white")
	word_cloud_obj = cloud_builder.generate(" ".join(word_cloud))

	# Draw the generated cloud with bilinear interpolation.
	plt.imshow(word_cloud_obj, interpolation='bilinear')

hamletbatista / add_word_cloud_to_df.py

Created February 27, 2019 22:12

hamletbatista / splitting_google_urls_by_1k.py

Created February 27, 2019 22:17

	# Rows whose "category" column equals "google".
	is_google = df["category"] == "google"
	google_df = df[is_google]

	# Three consecutive 1,000-row slices, then everything from row 3000 onward.
	first, second, third = (
	    google_df[start:start + 1000] for start in (0, 1000, 2000)
	)
	last = google_df[3000:]

hamletbatista / fake_transaction_pages.py

Created February 27, 2019 22:21

	import numpy as np

	# Select the rows whose path contains any of the listed keywords.
	# The alternatives are separated by an unescaped "|": in Python's `re`
	# syntax an escaped pipe (backslash + "|") matches a literal "|" character,
	# which would make the filter match no ordinary URL path at all.
	# .copy() makes the result independent of `df`, so adding a column below
	# does not write to a slice of the original frame (SettingWithCopyWarning).
	high_value_pages = df[
	    df.path.str.contains(r"adwords|facebook|ads|media", regex=True)
	].copy()

	# One random integer per selected row; randint's upper bound is exclusive,
	# so the values range from 1 to 199.
	high_value_pages["fake_transactions"] = np.random.randint(
	    1, 200, high_value_pages.shape[0]
	)

	# Move the current index into a regular column and renumber the rows.
	high_value_pages = high_value_pages.reset_index()

	# Keep only the two columns of interest.
	fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]