Created
November 8, 2019 18:36
-
-
Save chyld/984c74ef9cbb012aae8052f20f9b3d64 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def anomalyScores(originalDF, reducedDF):
    """Per-row reconstruction error, min-max scaled onto [0, 1].

    Parameters
    ----------
    originalDF, reducedDF : pd.DataFrame
        Same shape; ``reducedDF`` is the reconstruction of ``originalDF``
        (e.g. after dimensionality reduction and inverse transform).

    Returns
    -------
    pd.Series
        Sum-of-squared-errors per row, indexed like ``originalDF`` and
        rescaled so 0 is the smallest error and 1 the largest.
    """
    # Sum of squared differences across features -> one error per row.
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    # Min-max scale; compute min/max once instead of twice each.
    # NOTE(review): if every row has an identical error this divides by
    # zero and yields NaN, same as the original formulation.
    lo, hi = np.min(loss), np.max(loss)
    loss = (loss - lo) / (hi - lo)
    return loss
def plotResults(trueLabels, anomalyScores, returnPreds=False):
    """Plot precision-recall and ROC curves for a set of anomaly scores.

    Parameters
    ----------
    trueLabels : pd.Series
        Ground-truth binary labels (1 = anomaly/fraud).
    anomalyScores : pd.Series
        Score per observation, aligned on the same index as ``trueLabels``;
        higher means more anomalous.
    returnPreds : bool, optional
        When True, return the combined label/score DataFrame.

    Returns
    -------
    pd.DataFrame or None
        Frame with columns ['trueLabel', 'anomalyScore'] when
        ``returnPreds`` is True, otherwise None.
    """
    # Align labels and scores on their shared index.
    preds = pd.concat([trueLabels, anomalyScores], axis=1)
    preds.columns = ['trueLabel', 'anomalyScore']

    # --- Precision-recall curve ---
    precision, recall, thresholds = precision_recall_curve(
        preds['trueLabel'], preds['anomalyScore'])
    average_precision = average_precision_score(
        preds['trueLabel'], preds['anomalyScore'])

    plt.step(recall, precision, color='k', alpha=0.7, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))

    # --- ROC curve (new figure) ---
    fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['anomalyScore'])
    areaUnderROC = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: \
Area under the curve = {0:0.2f}'.format(areaUnderROC))
    plt.legend(loc="lower right")
    plt.show()

    # Idiomatic truthiness test instead of `== True`.
    if returnPreds:
        return preds
def scatterPlot(xDF, yDF, algoName):
    """Scatter the first two components of ``xDF``, colored by label.

    Parameters
    ----------
    xDF : pd.DataFrame
        Reduced representation. Assumes integer column labels so that
        ``.loc[:, 0:1]`` selects the first two components — TODO confirm
        against callers.
    yDF : pd.Series
        Labels, inner-joined on ``xDF``'s index.
    algoName : str
        Algorithm name used in the plot title.
    """
    # .loc with integer labels is INCLUSIVE: this keeps columns 0 and 1.
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF, yDF), axis=1, join="inner")
    tempDF.columns = ["First Vector", "Second Vector", "Label"]
    sns.lmplot(x="First Vector", y="Second Vector", hue="Label",
               data=tempDF, fit_reg=False)
    ax = plt.gca()
    ax.set_title("Separation of Observations using " + algoName)
def print_stats(preds):
    """Print precision/recall of the top-scored rows at a natural cutoff.

    The cutoff is the number of true positives in ``preds``; the top
    ``cutoff`` rows by anomaly score are treated as the positive
    predictions. (At this particular cutoff precision and recall are
    mathematically equal.)

    Parameters
    ----------
    preds : pd.DataFrame
        Must have columns ``trueLabel`` (binary, 1 = fraud) and
        ``anomalyScore``. Not modified.
    """
    # Sort a copy so the caller's DataFrame is not reordered
    # (the original used inplace=True, mutating the argument).
    ranked = preds.sort_values(by="anomalyScore", ascending=False)
    cutoff = ranked.trueLabel.sum()   # total positives in the data
    predsTop = ranked[:cutoff]        # the `cutoff` highest-scored rows
    caught = predsTop.anomalyScore[predsTop.trueLabel == 1].count()
    # Out of everything flagged as fraud, how much actually is fraud.
    print("Precision: ", np.round(caught / cutoff, 2))
    # Out of all actual fraud, how much was caught. The denominator is
    # the positive count in `preds` itself — the original read a
    # module-level `y_train`, which silently broke the function for any
    # dataset other than the training split.
    print("Recall: ", np.round(caught / cutoff, 2))
    print("Fraud Caught out of", cutoff, "Cases:",
          predsTop.trueLabel.sum(),
          "Percentage caught:",
          predsTop.trueLabel.sum() / cutoff)
# --- Load the credit-card dataset and split features from the label ---
df = pd.read_csv("datasets/credit_card_data/cc.csv")
y = df.Class
X = df.drop(columns=['Class'])

# Stratified split keeps the (rare) fraud ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=2018, stratify=y)

# Standardize using training statistics only, then apply to both splits
# to avoid leaking test-set information into the scaler.
scaler = pp.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# NOTE(review): `pca` is not defined in this chunk — presumably an
# sklearn PCA instance constructed earlier in the file; confirm before
# running. Fit/transform the training data, then reconstruct it so the
# reconstruction error can serve as an anomaly score.
X_train_PCA = pca.fit_transform(X_train)
X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index)
X_train_PCA_inverse = pca.inverse_transform(X_train_PCA)
X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index)

scatterPlot(X_train_PCA, y_train, "PCA")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ok