# Gist by @teldridge11, created February 12, 2019
import json

import matplotlib.pyplot as plt
import pandas as pd
from dateutil import parser
from matplotlib import cm
from scipy.stats import zscore
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
# Pipeline overview:
#   1. A Ruby rake task pulls the data, stores it in tmp/data.json, and then
#      spawns this Python process.
#   2. The Ruby script polls every 15 seconds, waiting for tmp/anomalies.json
#      to appear.
#   3. This script loads data.json, analyzes it for anomalies, and dumps the
#      results to a JSON file.
#   4. The rake task sees the file, stops polling, and loads the anomaly data
#      into the database.
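# A minimal sketch of the record shape this script assumes in data.json
# (the field names come from the parsing loop below; the values, including
# the VM size, are illustrative):
#
#   [
#     {
#       "attributes": {"day": "2019-02-11", "instance_type": "Standard_D2s_v3"},
#       "values": {"number_of_instances": 12, "compute_cost": 48.0}
#     },
#     ...
#   ]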
# Load data
with open('data.json') as f:
    data = json.load(f)

cost_data = []
instance_count_data = []
for i, d in enumerate(data):
    attributes = d["attributes"]
    day = attributes["day"]
    instance_type = attributes["instance_type"]  # not used yet (see TODOs below)
    values = d["values"]
    number_of_instances = values["number_of_instances"]
    compute_cost = values["compute_cost"]
    # Unix timestamp for the day; .timestamp() is portable, unlike strftime("%s")
    timestamp = int(parser.parse(day).timestamp())  # not used yet
    cost_data.append([i, compute_cost])
    instance_count_data.append([i, number_of_instances])
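# cost_data and instance_count_data are now lists of [index, value] pairs,
# e.g. cost_data == [[0, 48.0], [1, 52.5], ...] (values illustrative)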
# COST ANOMALY DETECTION

# Flag z-score outliers in the cost series
raw_data = cost_data
cost_data = pd.DataFrame(cost_data)
cost_data[1] = zscore(cost_data[1])
cost_data["is_outlier"] = cost_data[1].apply(lambda x: x <= -3 or x >= 3)
print(cost_data[cost_data["is_outlier"]])  # inspect the flagged rows
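# zscore standardizes the series to (x - mean) / std, so the test above keeps
# points more than three standard deviations from the mean; for roughly normal
# data that is only about 0.3% of points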
# Min-max scale the (index, z-scored cost) pairs to [0, 1] so DBSCAN's eps
# applies comparably on both axes
scaler = MinMaxScaler()
dayAndCost = cost_data[[0, 1]]
dayAndCost = scaler.fit_transform(dayAndCost)
dayAndCost = pd.DataFrame(dayAndCost, columns=[0, 1])
# Cluster with DBSCAN; points that fall in no cluster are labeled as noise
outlier_detection = DBSCAN(eps=0.3, metric="euclidean", min_samples=3, n_jobs=-1)
clusters = outlier_detection.fit_predict(dayAndCost)
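# fit_predict returns one label per row: -1 for noise (the outliers we want),
# 0, 1, 2, ... for clustered points. For example, the outlier row indices can
# be pulled out like this:
#   outlier_indices = [i for i, label in enumerate(clusters) if label == -1]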
# Plot the scaled points, colored by cluster label
cmap = cm.get_cmap('Accent')
dayAndCost.plot.scatter(x=0, y=1, c=clusters, cmap=cmap, colorbar=False)
# plt.show()
# Build anomaly records for the noise points
anomaly_data = []
cloud = "Azure"
description = "Cost was much higher than normal"
count = 1
severity = "SEVERE"
for i, label in enumerate(clusters):
    if label == -1:  # DBSCAN labels noise (outlier) points -1
        anomaly = {}
        anomaly["cloud"] = cloud
        anomaly["description"] = description
        anomaly["count"] = count
        anomaly["severity"] = severity
        cost_data_point = raw_data[i][1]
        prev_cost_data_point = raw_data[i - 1][1]
        if cost_data_point > prev_cost_data_point:
            verb = "increased"
        else:
            verb = "decreased"
        # Absolute day-over-day change in dollars
        cost_change = int(round(abs(cost_data_point - prev_cost_data_point)))
        anomaly["impact"] = "Your instance cost " + verb + " " + str(cost_change) + " dollars in one day"
        anomaly_data.append(anomaly)
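# Each record appended above looks roughly like this (values illustrative):
#   {"cloud": "Azure", "description": "Cost was much higher than normal",
#    "count": 1, "severity": "SEVERE",
#    "impact": "Your instance cost increased 42 dollars in one day"}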
# INSTANCE COUNT ANOMALY DETECTION

# Flag z-score outliers in the instance-count series
raw_instance_count_data = instance_count_data
instance_count_data = pd.DataFrame(instance_count_data)
instance_count_data[1] = zscore(instance_count_data[1])
instance_count_data["is_outlier"] = instance_count_data[1].apply(lambda x: x <= -3 or x >= 3)
print(instance_count_data[instance_count_data["is_outlier"]])  # inspect the flagged rows
# Min-max scale the (index, z-scored count) pairs to [0, 1]
scaler = MinMaxScaler()
dayAndCount = instance_count_data[[0, 1]]
dayAndCount = scaler.fit_transform(dayAndCount)
dayAndCount = pd.DataFrame(dayAndCount, columns=[0, 1])
# Cluster with DBSCAN (note the tighter eps than the cost model); this
# overwrites `clusters` from the cost section
outlier_detection = DBSCAN(eps=0.2, metric="euclidean", min_samples=3, n_jobs=-1)
clusters = outlier_detection.fit_predict(dayAndCount)
# Plot the scaled points, colored by cluster label
cmap = cm.get_cmap('Accent')
dayAndCount.plot.scatter(x=0, y=1, c=clusters, cmap=cmap, colorbar=False)
# plt.show()
# Build anomaly records for the noise points
cloud = "Azure"
description = "Instance count was much higher than normal"
count = 1
severity = "SEVERE"
if cloud == "Azure":
    term = "VM"
else:
    term = "Instance"
for i, label in enumerate(clusters):
    if label == -1:  # DBSCAN labels noise (outlier) points -1
        anomaly = {}
        anomaly["cloud"] = cloud
        anomaly["description"] = description
        anomaly["count"] = count
        anomaly["severity"] = severity
        count_data_point = raw_instance_count_data[i][1]
        prev_count_data_point = raw_instance_count_data[i - 1][1]
        if count_data_point > prev_count_data_point:
            verb = "increased"
        else:
            verb = "decreased"
        # Absolute change in instance count
        count_change = int(round(abs(count_data_point - prev_count_data_point)))
        anomaly["impact"] = "Your " + term + " count " + verb + " by " + str(count_change) + " " + term + "s in the last 15 minutes"
        anomaly_data.append(anomaly)
# TODO: REGION ANOMALY DETECTION
# TODO: INSTANCE TYPE ANOMALY DETECTION
# Write anomaly data to anomalies.json for the Ruby rake task to pick up
with open('anomalies.json', 'w') as outfile:
    json.dump(anomaly_data, outfile)