# Gist by @teldridge11, created February 12, 2019
import json

import matplotlib.pyplot as plt
import pandas as pd
from dateutil import parser
from matplotlib import cm
from scipy.stats import zscore
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
# Pipeline overview:
#   1. A Ruby rake task pulls the data, stores it in tmp/data.json, and then
#      spawns this Python process.
#   2. The Ruby script polls every 15 seconds, waiting for tmp/anomalies.json
#      to appear.
#   3. This script loads data.json, analyzes it for anomalies, and dumps the
#      results to a JSON file.
#   4. The rake task sees the file, stops polling, and loads the anomaly data
#      into the database.
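# A minimal sketch of the record shape this script assumes in data.json
# (the field names come from the parsing loop below; the values, including
# the VM size, are illustrative):
#
#   [
#     {
#       "attributes": {"day": "2019-02-11", "instance_type": "Standard_D2s_v3"},
#       "values": {"number_of_instances": 12, "compute_cost": 48.0}
#     },
#     ...
#   ]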
# Load data
with open('data.json') as f:
    data = json.load(f)

cost_data = []
instance_count_data = []
for i, d in enumerate(data):
    attributes = d["attributes"]
    day = attributes["day"]
    instance_type = attributes["instance_type"]  # not used yet (see TODOs below)
    values = d["values"]
    number_of_instances = values["number_of_instances"]
    compute_cost = values["compute_cost"]
    # Unix timestamp for the day; .timestamp() is portable, unlike strftime("%s")
    timestamp = int(parser.parse(day).timestamp())  # not used yet
    cost_data.append([i, compute_cost])
    instance_count_data.append([i, number_of_instances])
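# cost_data and instance_count_data are now lists of [index, value] pairs,
# e.g. cost_data == [[0, 48.0], [1, 52.5], ...] (values illustrative)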
# COST ANOMALY DETECTION

# Flag z-score outliers in the cost series
raw_data = cost_data
cost_data = pd.DataFrame(cost_data)
cost_data[1] = zscore(cost_data[1])
cost_data["is_outlier"] = cost_data[1].apply(lambda x: x <= -3 or x >= 3)
print(cost_data[cost_data["is_outlier"]])  # inspect the flagged rows
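# zscore standardizes the series to (x - mean) / std, so the test above keeps
# points more than three standard deviations from the mean; for roughly normal
# data that is only about 0.3% of points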
# Min-max scale the (index, z-scored cost) pairs to [0, 1] so DBSCAN's eps
# applies comparably on both axes
scaler = MinMaxScaler()
dayAndCost = cost_data[[0, 1]]
dayAndCost = scaler.fit_transform(dayAndCost)
dayAndCost = pd.DataFrame(dayAndCost, columns=[0, 1])
# Cluster with DBSCAN; points that fall in no cluster are labeled as noise
outlier_detection = DBSCAN(eps=0.3, metric="euclidean", min_samples=3, n_jobs=-1)
clusters = outlier_detection.fit_predict(dayAndCost)
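# fit_predict returns one label per row: -1 for noise (the outliers we want),
# 0, 1, 2, ... for clustered points. For example, the outlier row indices can
# be pulled out like this:
#   outlier_indices = [i for i, label in enumerate(clusters) if label == -1]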
# Plot the scaled points, colored by cluster label
cmap = cm.get_cmap('Accent')
dayAndCost.plot.scatter(x=0, y=1, c=clusters, cmap=cmap, colorbar=False)
# plt.show()
# Build anomaly records for the noise points
anomaly_data = []
cloud = "Azure"
description = "Cost was much higher than normal"
count = 1
severity = "SEVERE"
for i, label in enumerate(clusters):
    if label == -1:  # DBSCAN labels noise (outlier) points -1
        anomaly = {}
        anomaly["cloud"] = cloud
        anomaly["description"] = description
        anomaly["count"] = count
        anomaly["severity"] = severity
        cost_data_point = raw_data[i][1]
        prev_cost_data_point = raw_data[i - 1][1]
        if cost_data_point > prev_cost_data_point:
            verb = "increased"
        else:
            verb = "decreased"
        # Absolute day-over-day change in dollars
        cost_change = int(round(abs(cost_data_point - prev_cost_data_point)))
        anomaly["impact"] = "Your instance cost " + verb + " " + str(cost_change) + " dollars in one day"
        anomaly_data.append(anomaly)
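# Each record appended above looks roughly like this (values illustrative):
#   {"cloud": "Azure", "description": "Cost was much higher than normal",
#    "count": 1, "severity": "SEVERE",
#    "impact": "Your instance cost increased 42 dollars in one day"}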
# INSTANCE COUNT ANOMALY DETECTION

# Flag z-score outliers in the instance-count series
raw_instance_count_data = instance_count_data
instance_count_data = pd.DataFrame(instance_count_data)
instance_count_data[1] = zscore(instance_count_data[1])
instance_count_data["is_outlier"] = instance_count_data[1].apply(lambda x: x <= -3 or x >= 3)
print(instance_count_data[instance_count_data["is_outlier"]])  # inspect the flagged rows
# Min-max scale the (index, z-scored count) pairs to [0, 1]
scaler = MinMaxScaler()
dayAndCount = instance_count_data[[0, 1]]
dayAndCount = scaler.fit_transform(dayAndCount)
dayAndCount = pd.DataFrame(dayAndCount, columns=[0, 1])
# Cluster with DBSCAN (note the tighter eps than the cost model); this
# overwrites `clusters` from the cost section
outlier_detection = DBSCAN(eps=0.2, metric="euclidean", min_samples=3, n_jobs=-1)
clusters = outlier_detection.fit_predict(dayAndCount)
# Plot the scaled points, colored by cluster label
cmap = cm.get_cmap('Accent')
dayAndCount.plot.scatter(x=0, y=1, c=clusters, cmap=cmap, colorbar=False)
# plt.show()
# Build anomaly records for the noise points
cloud = "Azure"
description = "Instance count was much higher than normal"
count = 1
severity = "SEVERE"
if cloud == "Azure":
    term = "VM"
else:
    term = "Instance"
for i, label in enumerate(clusters):
    if label == -1:  # DBSCAN labels noise (outlier) points -1
        anomaly = {}
        anomaly["cloud"] = cloud
        anomaly["description"] = description
        anomaly["count"] = count
        anomaly["severity"] = severity
        count_data_point = raw_instance_count_data[i][1]
        prev_count_data_point = raw_instance_count_data[i - 1][1]
        if count_data_point > prev_count_data_point:
            verb = "increased"
        else:
            verb = "decreased"
        # Absolute change in instance count
        count_change = int(round(abs(count_data_point - prev_count_data_point)))
        anomaly["impact"] = "Your " + term + " count " + verb + " by " + str(count_change) + " " + term + "s in the last 15 minutes"
        anomaly_data.append(anomaly)
# TODO: REGION ANOMALY DETECTION
# TODO: INSTANCE TYPE ANOMALY DETECTION
# Write anomaly data to anomalies.json for the Ruby rake task to pick up
with open('anomalies.json', 'w') as outfile:
    json.dump(anomaly_data, outfile)