Created
November 8, 2019 18:36
-
-
Save chyld/984c74ef9cbb012aae8052f20f9b3d64 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def anomalyScores(originalDF, reducedDF):
    """Per-row reconstruction error, min-max scaled onto [0, 1].

    Parameters
    ----------
    originalDF, reducedDF : pd.DataFrame
        Same shape; ``reducedDF`` is the reconstruction of ``originalDF``
        (e.g. after dimensionality reduction and inverse transform).

    Returns
    -------
    pd.Series
        Sum-of-squared-errors per row, indexed like ``originalDF`` and
        rescaled so 0 is the smallest error and 1 the largest.
    """
    # Sum of squared differences across features -> one error per row.
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    # Min-max scale; compute min/max once instead of twice each.
    # NOTE(review): if every row has an identical error this divides by
    # zero and yields NaN, same as the original formulation.
    lo, hi = np.min(loss), np.max(loss)
    loss = (loss - lo) / (hi - lo)
    return loss
def plotResults(trueLabels, anomalyScores, returnPreds=False):
    """Plot precision-recall and ROC curves for a set of anomaly scores.

    Parameters
    ----------
    trueLabels : pd.Series
        Ground-truth binary labels (1 = anomaly/fraud).
    anomalyScores : pd.Series
        Score per observation, aligned on the same index as ``trueLabels``;
        higher means more anomalous.
    returnPreds : bool, optional
        When True, return the combined label/score DataFrame.

    Returns
    -------
    pd.DataFrame or None
        Frame with columns ['trueLabel', 'anomalyScore'] when
        ``returnPreds`` is True, otherwise None.
    """
    # Align labels and scores on their shared index.
    preds = pd.concat([trueLabels, anomalyScores], axis=1)
    preds.columns = ['trueLabel', 'anomalyScore']

    # --- Precision-recall curve ---
    precision, recall, thresholds = precision_recall_curve(
        preds['trueLabel'], preds['anomalyScore'])
    average_precision = average_precision_score(
        preds['trueLabel'], preds['anomalyScore'])

    plt.step(recall, precision, color='k', alpha=0.7, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))

    # --- ROC curve (new figure) ---
    fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['anomalyScore'])
    areaUnderROC = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: \
Area under the curve = {0:0.2f}'.format(areaUnderROC))
    plt.legend(loc="lower right")
    plt.show()

    # Idiomatic truthiness test instead of `== True`.
    if returnPreds:
        return preds
def scatterPlot(xDF, yDF, algoName):
    """Scatter the first two components of ``xDF``, colored by label.

    Parameters
    ----------
    xDF : pd.DataFrame
        Reduced representation. Assumes integer column labels so that
        ``.loc[:, 0:1]`` selects the first two components — TODO confirm
        against callers.
    yDF : pd.Series
        Labels, inner-joined on ``xDF``'s index.
    algoName : str
        Algorithm name used in the plot title.
    """
    # .loc with integer labels is INCLUSIVE: this keeps columns 0 and 1.
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF, yDF), axis=1, join="inner")
    tempDF.columns = ["First Vector", "Second Vector", "Label"]
    sns.lmplot(x="First Vector", y="Second Vector", hue="Label",
               data=tempDF, fit_reg=False)
    ax = plt.gca()
    ax.set_title("Separation of Observations using " + algoName)
def print_stats(preds):
    """Print precision/recall of the top-scored rows at a natural cutoff.

    The cutoff is the number of true positives in ``preds``; the top
    ``cutoff`` rows by anomaly score are treated as the positive
    predictions. (At this particular cutoff precision and recall are
    mathematically equal.)

    Parameters
    ----------
    preds : pd.DataFrame
        Must have columns ``trueLabel`` (binary, 1 = fraud) and
        ``anomalyScore``. Not modified.
    """
    # Sort a copy so the caller's DataFrame is not reordered
    # (the original used inplace=True, mutating the argument).
    ranked = preds.sort_values(by="anomalyScore", ascending=False)
    cutoff = ranked.trueLabel.sum()   # total positives in the data
    predsTop = ranked[:cutoff]        # the `cutoff` highest-scored rows
    caught = predsTop.anomalyScore[predsTop.trueLabel == 1].count()
    # Out of everything flagged as fraud, how much actually is fraud.
    print("Precision: ", np.round(caught / cutoff, 2))
    # Out of all actual fraud, how much was caught. The denominator is
    # the positive count in `preds` itself — the original read a
    # module-level `y_train`, which silently broke the function for any
    # dataset other than the training split.
    print("Recall: ", np.round(caught / cutoff, 2))
    print("Fraud Caught out of", cutoff, "Cases:",
          predsTop.trueLabel.sum(),
          "Percentage caught:",
          predsTop.trueLabel.sum() / cutoff)
# --- Load the credit-card dataset and split features from the label ---
df = pd.read_csv("datasets/credit_card_data/cc.csv")
y = df.Class
X = df.drop(columns=['Class'])

# Stratified split keeps the (rare) fraud ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=2018, stratify=y)

# Standardize using training statistics only, then apply to both splits
# to avoid leaking test-set information into the scaler.
scaler = pp.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# NOTE(review): `pca` is not defined in this chunk — presumably an
# sklearn PCA instance constructed earlier in the file; confirm before
# running. Fit/transform the training data, then reconstruct it so the
# reconstruction error can serve as an anomaly score.
X_train_PCA = pca.fit_transform(X_train)
X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index)
X_train_PCA_inverse = pca.inverse_transform(X_train_PCA)
X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index)

scatterPlot(X_train_PCA, y_train, "PCA")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ok