Created
April 22, 2011 22:23
-
-
Save HarlanH/937821 to your computer and use it in GitHub Desktop.
Visualizing Categorizations blog post code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo of techniques to visualize the predictions made by a categorization model. | |
library(ROCR) | |
library(ggplot2) | |
load(url('http://dl.dropbox.com/u/7644953/classifier-visualization.Rdata')) | |
pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0) | |
pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ] | |
pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin) | |
pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin) | |
head(pred.df) | |
summary(pred.df) | |
baserate <- mean(pred.df$actual.bin) | |
# visualizations using ROCR | |
# convert to their object type (labels should be some sort of ordered type) | |
pred.rocr <- prediction(pred.df$predicted, pred.df$actual) | |
# some scalar summary stats | |
# Area Under the ROC Curve | |
performance(pred.rocr, 'auc')@y.values[[1]] | |
# Precision/Recall breakeven | |
performance(pred.rocr, 'prbe')@x.values[[1]] | |
# some ROCR graphs | |
# ROC Curve | |
plot(performance(pred.rocr, 'tpr', 'fpr')) | |
# Sensitivity/Specificity | |
plot(performance(pred.rocr, 'sens', 'spec')) | |
# Lift Curve | |
plot(performance(pred.rocr, 'lift', 'rpp')) | |
lift.perf <- performance(pred.rocr, 'lift', 'rpp') | |
lift.df <- data.frame([email protected][[1]], # alpha is cutoff | |
[email protected][[1]]) | |
lift.df <- lift.df[2:nrow(lift.df), ] | |
# decile table | |
dec.table <- ldply((1:10)/10, function(x) data.frame( | |
decile=x, | |
prop.yes=sum(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/sum(pred.df$actual.bin), | |
lift=mean(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/mean(pred.df$actual.bin))) | |
print(dec.table, digits=2) | |
# dual smear graph | |
plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) + | |
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) + | |
stat_smooth() + | |
geom_abline(slope=1, intercept=0, color='grey', size=.1) + | |
geom_vline(xintercept=baserate, color='orange', size=.5) + | |
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) + | |
scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) + | |
coord_equal() | |
# single smear graph | |
plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) + | |
geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) + | |
geom_line(data=lift.df, aes(y=lift, color=NULL)) + | |
scale_color_manual(values=c('darkblue', 'darkred')) + | |
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) + | |
scale_y_continuous('Lift') + | |
geom_vline(xintercept=baserate, color='orange', size=.5) + | |
geom_hline(yintercept=1, color='darkgrey', size=.5) | |
# dual smear with cumulative reach of yesses | |
plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) + | |
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) + | |
geom_vline(xintercept=baserate, color='orange', size=.5) + | |
scale_color_manual(values=c('darkblue', 'darkred')) + | |
scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) + | |
scale_y_continuous('Cumulative Proportion of Yesses Seen') | |
# pos/neg densities | |
plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) + | |
stat_density(alpha=.5, position='identity') + | |
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) + | |
scale_y_sqrt('Density') | |
Harlan, Thanks! Couldn't find your email but added you to my Google+. I'm doing Bioinformatics research. The topic is: Removing Badly Aligned or Unrelated Sequences from Multiple Sequence Alignments. So, here is one of the places where you're code ends up: alevchuk/ms@f348713#L0R37
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Aleksandr, sure! It's in the public domain. If you feel like citing me, a footnote (not a formal citation) with my name and a link to the gist would be appreciated. What's your topic? (Email me directly -- my email address is easy to google.)