Skip to content

Instantly share code, notes, and snippets.

@chyld
Created November 8, 2019 18:36
Show Gist options
  • Save chyld/984c74ef9cbb012aae8052f20f9b3d64 to your computer and use it in GitHub Desktop.
def anomalyScores(originalDF, reducedDF):
    """Return the per-row reconstruction error, min-max scaled to [0, 1].

    Parameters
    ----------
    originalDF : pd.DataFrame
        Original feature matrix.
    reducedDF : pd.DataFrame or array-like
        Reconstruction of ``originalDF`` (same shape), e.g. from an
        inverse PCA transform.

    Returns
    -------
    pd.Series
        Indexed like ``originalDF``; 1.0 marks the row with the highest
        reconstruction error, 0.0 the lowest.
    """
    # Sum of squared reconstruction errors for each row.
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    # Min-max scale. Hoist min/max so each is computed once, and guard
    # against a zero range (every row reconstructed equally well), which
    # in the original produced a NaN series via 0/0.
    lo = np.min(loss)
    rng = np.max(loss) - lo
    if rng == 0:
        return pd.Series(data=0.0, index=originalDF.index)
    return (loss - lo) / rng
def plotResults(trueLabels, anomalyScores, returnPreds=False):
    """Plot the precision-recall and ROC curves for a set of anomaly scores.

    Parameters
    ----------
    trueLabels : pd.Series
        Ground-truth binary labels (1 = anomaly/fraud), aligned by index
        with ``anomalyScores``.
    anomalyScores : pd.Series
        Anomaly score per observation (higher = more anomalous).
    returnPreds : bool, optional
        If True, also return the combined label/score DataFrame.

    Returns
    -------
    pd.DataFrame or None
        Columns ``trueLabel`` and ``anomalyScore`` when ``returnPreds``
        is True; otherwise None.
    """
    preds = pd.concat([trueLabels, anomalyScores], axis=1)
    preds.columns = ['trueLabel', 'anomalyScore']

    # Precision-recall curve plus its single-number summary.
    precision, recall, thresholds = precision_recall_curve(
        preds['trueLabel'], preds['anomalyScore'])
    average_precision = average_precision_score(
        preds['trueLabel'], preds['anomalyScore'])

    plt.step(recall, precision, color='k', alpha=0.7, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(
        'Precision-Recall curve: Average Precision = '
        '{0:0.2f}'.format(average_precision))

    # ROC curve on a fresh figure so it does not overdraw the PR plot.
    fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['anomalyScore'])
    areaUnderROC = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Receiver operating characteristic: '
        'Area under the curve = {0:0.2f}'.format(areaUnderROC))
    plt.legend(loc="lower right")
    plt.show()

    # Idiomatic truthiness test (was `if returnPreds==True:`).
    if returnPreds:
        return preds
def scatterPlot(xDF, yDF, algoName):
    """Scatter the first two columns of xDF, colored by the labels in yDF.

    Parameters
    ----------
    xDF : pd.DataFrame
        Reduced feature matrix; columns labeled 0 and 1 are plotted.
    yDF : pd.Series or pd.DataFrame
        Labels, joined to xDF on the index (inner join).
    algoName : str
        Name of the algorithm, used in the plot title.
    """
    # .loc[:, 0:1] is a label slice, so it picks columns 0 AND 1 inclusive.
    plot_df = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    plot_df = pd.concat((plot_df, yDF), axis=1, join="inner")
    plot_df.columns = ["First Vector", "Second Vector", "Label"]
    sns.lmplot(
        x="First Vector",
        y="Second Vector",
        hue="Label",
        data=plot_df,
        fit_reg=False,
    )
    plt.gca().set_title("Separation of Observations using " + algoName)
def print_stats(preds):
    """Print precision/recall at a cutoff equal to the number of positives.

    Ranks observations by anomaly score (descending), flags the top
    ``k`` as fraud where ``k`` is the total number of true positives,
    and reports how many of those flags are correct.

    Parameters
    ----------
    preds : pd.DataFrame
        Must contain ``trueLabel`` (binary) and ``anomalyScore`` columns.

    Notes
    -----
    The original version (a) sorted ``preds`` in place, mutating the
    caller's DataFrame, and (b) took the recall denominator from a
    module-level ``y_train`` global; when ``preds`` covers the full
    training set that equals ``preds.trueLabel.sum()``, which is what
    this version uses, removing the hidden global dependency.
    """
    # Sort a copy so the caller's DataFrame is left untouched.
    ranked = preds.sort_values(by="anomalyScore", ascending=False)
    total_fraud = int(ranked.trueLabel.sum())
    cutoff = total_fraud
    top = ranked[:cutoff]
    caught = int(top.trueLabel.sum())
    # Out of everything flagged as fraud, how much is actually fraud.
    print("Precision: ", np.round(caught / cutoff, 2))
    # Out of all actual fraud, how much was caught.
    print("Recall: ", np.round(caught / total_fraud, 2))
    print("Fraud Caught out of", cutoff, "Cases:",
          caught,
          "Percentage caught:",
          caught / cutoff)
# Load the credit-card dataset; `Class` is the binary fraud label
# (used as the target, everything else as features).
df = pd.read_csv("datasets/credit_card_data/cc.csv")
y = df.Class
X = df.drop(columns=['Class'])
# Stratified split keeps the (rare) fraud class at the same proportion
# in train and test; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2018, stratify=y)
# Standardize with statistics fit on the training set only, then apply
# the same transform to the test set (avoids data leakage).
# NOTE(review): variable is spelled `scalar`; presumably `scaler` was
# meant — left unchanged in case later notebook cells reference it.
scalar = pp.StandardScaler().fit(X_train)
# Re-wrap the scaled arrays as DataFrames to keep column names and index.
X_train = pd.DataFrame(scalar.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scalar.transform(X_test), columns=X.columns, index=X_test.index)
@chyld
Copy link
Author

chyld commented Nov 8, 2019

ok

@chyld
Copy link
Author

chyld commented Nov 8, 2019

# NOTE(review): `pca` is not defined in this snippet — presumably an
# sklearn.decomposition.PCA instance created in an earlier cell; confirm.
# Project the scaled training data onto the principal components.
X_train_PCA = pca.fit_transform(X_train)
X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index)
# Map the reduced representation back to the original feature space;
# the per-row reconstruction error can then be scored (e.g. with
# anomalyScores(X_train, X_train_PCA_inverse)).
X_train_PCA_inverse = pca.inverse_transform(X_train_PCA)
X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index)
# Visualize the first two principal components, colored by fraud label.
scatterPlot(X_train_PCA, y_train, "PCA")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment