# This gist uses the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# applied with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected
# hold-out sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and
# neural networks. The upside, of course, is that it estimates/generates predictions in a couple of seconds.
library(tidyverse); library(reticulate); library(pROC)
# Download some data from Kaggle
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff")
# Read it -- there's a row in there that's got a float in an integer position,
# so drop any rows that don't parse cleanly
credit_cards <- read_csv("~/Documents/kagglestuff/creditcard.csv")
credit_cards <- credit_cards %>% filter(complete.cases(.))
labels <- credit_cards$Class
# We're going to discretize each column into N quantile bins
N <- 10
# Some functions we'll need
make_percentiles <- function(x) ordered(ntile(x, n = N))
softmax <- function(x) exp(x)/sum(exp(x))
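# Quick illustration of the helpers: make_percentiles(1:20) gives an ordered factor
# with levels 1..10 (two observations per decile); softmax(c(0, 0)) gives c(0.5, 0.5).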
# One-hot encode the deciles using pandas' get_dummies, called via reticulate
pd <- import("pandas")
X <- credit_cards %>%
  dplyr::select(-Class) %>%
  mutate_all(.funs = funs(make_percentiles)) %>%
  apply(2, pd$get_dummies) %>%
  as.data.frame()
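# A base-R sketch of the same one-hot encoding, in case the reticulate/pandas
# dependency is unwanted (one_hot is a hypothetical helper, not from the original post):
one_hot <- function(f) {
  # rows of the N x N identity matrix, indexed by each value's bin
  m <- diag(nlevels(f))[as.integer(f), , drop = FALSE]
  colnames(m) <- levels(f)
  m
}
# X <- credit_cards %>%
#   dplyr::select(-Class) %>%
#   mutate_all(.funs = funs(make_percentiles)) %>%
#   lapply(one_hot) %>%
#   do.call(cbind, .) %>%
#   as.data.frame()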
# Let's do some random cross-validation
folds <- 50
aucs <- NULL
for(i in 1:folds) {
  training_rows <- sample(1:nrow(X), round(nrow(X)*((folds - 1)/folds)))
  training_X <- X[training_rows,] %>% as.matrix
  testing_X <- X[-training_rows,] %>% as.matrix
  training_labels <- labels[training_rows]
  testing_labels <- labels[-training_rows]
  almost_average <- function(x) mean(c(x, 1/N))
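  # almost_average() appends a single pseudo-observation of 1/N before taking the
  # mean -- a light Laplace-style smoother that keeps the estimated probabilities
  # below strictly inside (0, 1), so their logs stay finite.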
  X2 <- bind_cols(labels = training_labels, as_data_frame(training_X)) %>%
    group_by(labels) %>%
    summarise_all(.funs = funs(almost_average)) %>%
    select(-labels) %>%
    as.matrix
  t_probs <- log(X2)
  n_probs <- log(1 - X2)
  # Approximate log likelihood
  log_likelihoods <- testing_X %*% t(t_probs) + (1 - testing_X) %*% t(n_probs)
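  # Entry [r, c] above is sum_j [ x_rj*log(theta_cj) + (1 - x_rj)*log(1 - theta_cj) ]:
  # a Bernoulli naive-Bayes-style log likelihood for hold-out row r under class c,
  # with the class-conditional probabilities theta given by the smoothed means in X2.
  # Softmaxing each row then yields (equal-prior) posterior class probabilities.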
  predicted_probs <- t(apply(log_likelihoods, 1, softmax))
  roc_1 <- roc(testing_labels, predicted_probs[,2])
  aucs[i] <- roc_1$auc
  # Overlay this fold's ROC curve on the previous folds' curves
  plot(roc_1, col = i, add = (i > 1))
}
data_frame(auc = aucs) %>%
  ggplot(aes(y = auc, x = 1)) +
  geom_violin(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")
mean(aucs)
data_frame(auc = aucs) %>%
  ggplot(aes(x = auc)) +
  geom_histogram(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")