#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# ----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)
#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A little note on how the random numbers are created:
# A transform should work on an arbitrary chunk of data. Typically
# RevoScaleR functions will test transforms on a small chunk before
# fully processing. The internal variable (.rxNumRows) gives the size
# of the chunk.
rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms=list(urns = as.integer(runif(.rxNumRows,1,11))),
           overwrite=TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)
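# As a quick sanity check in plain R (no RevoScaleR needed), the
# truncated-uniform trick used in the transform above produces the
# integers 1 through 10 with roughly equal frequency (the seed here
# is arbitrary):
set.seed(123)
table(as.integer(runif(1e5, 1, 11)))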
#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData="mortDefault2",outFile="mortDefault3",
           varsToDrop="default",
           overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula=form,
                  data = "mortDefault3",
                  numClusters = 3,
                  outFile = "mortDefault3",
                  algorithm = "lloyd",
                  overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file="mortDefault3",
                         rowSelection=urns == 5,
                         maxRowsByCols = 1000)
plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)
###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidiary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
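# A quick hypothetical check of Cmatrix() on hand-built counts. This toy
# data frame assumes the same row order that the rxCube() call further
# below produces (actual value varying fastest): TN, FN, FP, TP.
toy <- data.frame(default = c(0,1,0,1),
                  LogitPred.L = c(FALSE,FALSE,TRUE,TRUE),
                  Counts = c(900, 40, 10, 50))
Cmatrix(toy)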
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#
#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms=list(CS = creditScore,
                              YR = year,
                              yrE = yearsEmploy,
                              HA = houseAge,
                              ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE )
rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms=list(CS = creditScore,
                              YR = year,
                              yrE = yearsEmploy,
                              HA = houseAge,
                              ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE )
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data="mdTrain",
                   reportProgress=rxGetOption("reportProgress") )
)
#
# Timings from three runs:
#Elapsed computation time: 21.533 secs.
#user  system elapsed
#56.15   12.02   21.55
#Elapsed computation time: 23.149 secs.
#user  system elapsed
#56.81   10.58   23.17
#Elapsed computation time: 24.384 secs.
#user  system elapsed
#59.29   10.31   24.48
summary(model)
#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable: round() thresholds the predicted
# probability at 0.5, and as.logical() turns the 0/1 result into FALSE/TRUE
rxDataStep(inData="mdTest",outFile="mdTest",
           transforms=list(LogitPred.L = as.logical(round(LogitPred))),
           overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)
# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC Curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------
# BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data="mdTrain",
                        blocksPerRead = 1,
                        maxDepth=5,
                        reportProgress=rxGetOption("reportProgress") )
)
##
#Elapsed time for RxDTreeBase: 89.545 secs.
#
#user  system elapsed
#245.13   12.50   89.57
#Elapsed time for RxDTreeBase: 403.785 secs.
# This run was to fully build out the tree
#user   system elapsed
#1092.37   75.89  403.83
model.tree
#
#----------------------------------------------------------------
# Plot the Tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------
###### - END DEMO HERE - ###########
#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
#                    "ISwR","partykit","pROC","rattle","RColorBrewer",
#                    "rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A First look at R
# A simple regression example from
# Statistics and Computing, Introductory Statistics with R
# Peter Dalgaard, Springer 2002
##
library(ISwR)          # Load a library
data()                 # Have a look at what data sets are available
data(thuesen)          # Load thuesen into the environment
thuesen                # Have a look at it
class(thuesen)         # Find out what kind of object thuesen is
sapply(thuesen,class)  # See what kinds of animal the variables are
#
plot(short.velocity ~ blood.glucose, data=thuesen)  # plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity)  # plot the data by indexing into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen)  # build a linear model
summary(model)  # Look at the results
str(model)      # Look at the structure of the model object
# Build a fancier plot
plot(x=thuesen$blood.glucose,
     y=thuesen$short.velocity,
     xlab="blood glucose (mmol / l)",
     ylab="circumferential shortening velocity (%/s)",
     main="Thuesen Data set",
     col="blue",
     pch=19)
abline(model,col="red")
#
par(mfrow=c(2,2))        # Set up for multiple plots
plot(model, col="blue")  # look at some diagnostics
#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data,probs=seq(0,1,0.25))
  res <- list(min,max,q)
  return(res)
}
attach(thuesen)           # make the columns of thuesen available
                          # in the global environment as variables
joe.stats(blood.glucose)  # Run our function
summary(blood.glucose)    # R does it better
# Set up for later
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
# SOME ADDITIONAL ONLINE RESOURCES
# An Introduction to R
# Notes on R: A Programming Environment for Data Analysis and Graphics
# Version 2.15.2 (2012-10-26)
# http://cran.r-project.org/doc/manuals/R-intro.pdf
#
# Using R for Data Analysis and Graphics
# Introduction, Code and Commentary
# J H Maindonald
# http://cran.r-project.org/doc/contrib/usingR.pdf
#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the data into training and test sets,
# builds two different decision trees (rpart) using the training data, and
# evaluates their performance using the test data set.
# An ROC curve is produced for the better model.
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
# name <- "weather.csv"
# path <- file.path(getwd(),name)
# weather <- read.csv(path,header=TRUE)
# Show weather on the IDE editor
data(weather)
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42)  # Set seed
#-------------------------------------------------------------------------
# Determine the observations for the training and test datasets.
N <- nrow(weather)                 # 366 observations
train <- sample(N, 0.8*N)          # 292 observations
test <- setdiff(seq_len(N),train)  # 74 observations not in train
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input <- names(weather)[1:(M-2)]   # names of input variables
target <- "RainTomorrow"           # name of target variable
form <- formula(RainTomorrow ~ .)  # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data=weather[train, c(input,target)],
                method="class",
                parms=list(split="information"),
                control=rpart.control(usesurrogate=0, maxsurrogate=0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m           # print the model
printcp(tree.m)  # print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the test set
pred <- predict(tree.m, weather[test, c(input,target)], type="class")
pred <- factor(pred, levels=c("Yes","No"))  # reorder (not relabel) the levels to match
                                            # the documentation for confusionMatrix
# Generate the confusion matrix
actual <- weather[test, c(input,target)]$RainTomorrow
actual <- factor(actual, levels=c("Yes","No"))  # reorder the levels the same way
AP <- c("Predicted","Actual")    # dimension names for CM
CM <- table(pred,actual,dnn=AP)  # CM counts
confusionMatrix(CM)              # from the caret package
?confusionMatrix                 # Look at the meaning of the confusionMatrix outputs
# Notes
# The "no-information rate" shown in the output is the largest proportion of the observed classes.
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree.
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi-squared test is that the actual and predicted
# probabilities are the same.
# See http://en.wikipedia.org/wiki/McNemar%27s_test
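# A minimal sketch of how these statistics fall out of a 2x2 table in
# base R, assuming a confusion matrix laid out like CM above
# (rows = predicted, columns = actual):
nir <- max(colSums(CM)) / sum(CM)                  # no-information rate
po  <- sum(diag(CM)) / sum(CM)                     # observed accuracy
pe  <- sum(rowSums(CM) * colSums(CM)) / sum(CM)^2  # agreement expected by chance
(po - pe) / (1 - pe)                               # Cohen's kappa
mcnemar.test(CM)                                   # McNemar's chi-squared test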
# | |
#--------------------------------------------------------------------------------------------
# Try another model using different variables
form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
                 data=weather[train, c(input,target)],
                 method="class",
                 parms=list(split="information"),
                 control=rpart.control(usesurrogate=2,
                                       maxsurrogate=0,
                                       minsplit=30,
                                       maxdepth=20))
#----------------------------------------------------------------------------------------------
# Plot the new tree
drawTreeNodes(tree.m2)
title(main="Weather Data tree.m2",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
tree.mod.p <- as.party(tree.m2)  # convert tree.m2 into a party object
plot(tree.mod.p)
#----------------------------------------------------------------------------
# Evaluate the performance of the new model on the test set
pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
pred2 <- factor(pred2, levels=c("Yes","No"))  # reorder the levels as before
CM2 <- table(pred2,actual,dnn=AP)
confusionMatrix(CM2)
# -----------------------------------------------------------------------------------
#
# GENERATE THE ROC CURVE FOR THE BEST MODEL
prROC <- predict(tree.m, weather[test, c(input,target)])[,2]  # predicted probability of rain
#
# Get the vector RainTomorrow in the test data set
testRT <- weather[test, c(input,target)]$RainTomorrow
pr <- prediction(prROC, testRT)
#------------------------------------------------------------------------------------
# Plot the ROC curve
plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2, add=FALSE)
# fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
# tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
segments(0,0,1,1,col="blue",lwd=2)
# Add a legend to the plot.
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
# Add decorations to the plot.
title(main="ROC Curve weather.csv [test data]",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#
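# A quick hand check of the tpr/fpr definitions above at a single
# threshold (0.5 is an arbitrary choice), using prROC and testRT from
# this script; performance() sweeps the same calculation over all thresholds.
yhat <- prROC > 0.5
sum(yhat & testRT == "Yes") / sum(testRT == "Yes")  # tpr = TP/P
sum(yhat & testRT == "No")  / sum(testRT == "No")   # fpr = FP/N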
#------------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# DATA MINING with CARET
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#------------------------------------------------------------------------------
# INTRODUCTION TO THE CARET PACKAGE
# caret is a feature-rich package for doing data mining in R.
# This script explores caret's capabilities using data included in the
# package that was described in the paper:
# Hill et al., "Impact of image segmentation on high-content
# screening data quality for SK-BR-3 cells",
# BMC Bioinformatics (2007) vol 8 (1) pp. 340
#
# Background
# Well-segmented cells are cells for which location and size may be accurately determined
# through optical measurements. Cells that are not well-segmented (WS) are said to be
# poorly-segmented (PS).
#
# Problem
# Given a set of optical measurements, can we predict which cells will be PS?
# This is a classic classification problem.
#---------------------------
library(ada)         # Boosting algorithms
library(caret)
library(rpart)       # CART algorithm for decision trees
library(partykit)    # Plotting trees
library(doParallel)  # parallel processing; by default:
                     # multicore functionality on Unix (single machine only),
                     # snow functionality on Windows (cluster)
library(pROC)        # plot the ROC curve
library(corrplot)    # plot correlations
#---------------------------
# data(package="caret")
data(segmentationData)  # Load the segmentation data set
dim(segmentationData)
#[1] 2019 61
head(segmentationData)  # Have a look at the data
trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
trainData <- segmentationData[trainIndex,]
dim(trainData)
#1010 61
testData <- segmentationData[-trainIndex,]
dim(testData)
#1009 61
#-------------------------------------------------------------------------------------
# VISUALIZE CORRELATIONS
trainV <- trainData[,4:61]
corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")
#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth=30,  # the maximum depth of any node of the final tree
                         cp = 0.01,    # complexity parameter: any split that does not decrease
                                       # the overall lack of fit by a factor of cp is not attempted
                         minsplit=20,  # the minimum number of observations that must exist in a
                                       # node in order for a split to be attempted
                         xval=10)      # number of cross-validations
ada.model <- ada(formula=form,
                 data=trainData,
                 control=control,
                 nu = .01,  # shrinkage parameter for boosting
                 iter=50)
ada.model$model[[1]]  # Look at the trees in the model
ada.model             # look at the model performance
plot(ada.model,TRUE)  # Plot error rate vs. iterations of the model
varplot(ada.model)    # Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values
# for the three tuning parameters?
#   nu
#   iter
#   maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
#   for each resampled data set do
#     hold out some samples
#     for each combination of the three tuning parameters do
#       Fit the model on the resampled data set
#       Predict the values of class on the hold-out samples
#     end
#   Calculate AUC: the area under the ROC curve for each sample
#   Select the combination of tuning parameters that yields the best AUC
#
# caret provides the "train" function to do all of this
#
# The trainControl function sets the training method
# Note: the default metrics for picking the best model are accuracy and Cohen's kappa
#
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method="repeatedcv",  # use repeated k-fold cross-validation
                     number=5,             # the number of folds
                     repeats=2,            # do 2 repetitions of 5-fold cv
                     summaryFunction=twoClassSummary,  # Use AUC to pick the best model
                     classProbs=TRUE)
# Use expand.grid to specify the search space
# Note that the default search grid selects 3 values of each tuning parameter
#
grid <- expand.grid(.nu=c(.1,1),
                    .iter=c(20,50),
                    .maxdepth=c(20,30))
#
set.seed(1)
#names(trainData)
trainX <- trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")
cl <- makeCluster(4)  # Use this to manually create a cluster.
                      # But, since I only have a single Windows machine,
                      # all I am doing is passing the number of cores to use to
                      # registerDoParallel()
registerDoParallel(cl)  # Register a parallel backend for train
getDoParWorkers()
system.time(ada.tune <- train(x=trainX,y=trainData$Class,
                              method = "ada",
                              metric = "ROC",
                              trControl = ctrl,
                              control=control,
                              tuneGrid=grid))
#
stopCluster(cl)
#user  system elapsed
#14.33    0.02  206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune             # Look at the results for the training grid
ada.tune$finalModel  # Look at the performance of the final model
plot(ada.tune)       # Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use the roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor=ada.probs$PS,
               response=testData$Class,
               levels=rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc  # Get the area under the curve
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
  svm.tune <- train(x=trainX,
                    y= trainData$Class,
                    method = "svmRadial",
                    tuneLength = 5,  # try 5 values of the cost parameter
                    preProc = c("center","scale"),
                    metric="ROC",
                    trControl=ctrl)  # same as for ada above
)
#user  system elapsed
#2.40    0.14   26.10
#--------------------------------------------------------------
# SVM RESULTS
svm.tune             # Look at the results for the training grid
svm.tune$finalModel  # Look at the performance of the final model
plot(svm.tune,
     metric="ROC",
     scales=list(x=list(log=2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models, we can compare the models using resampling.
# See Hothorn et al., "The design and analysis of benchmark experiments",
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699,
# for comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values    # Look at the resample values
summary(rValues)  # Summarize the resamples
#---------------------------------------------
xyplot(rValues,metric="ROC")    # scatter plot
bwplot(rValues,metric="ROC")    # boxplot
parallel(rValues,metric="ROC")  # parallel plot
dotplot(rValues,metric="ROC")   # dotplot
#
##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#################################################################################
#
library(rattle)  # load the rattle package
rattle()         # start the rattle user interface
#
#data()                     # see what data sets are available in all of the loaded packages
data(package="rattle")      # see what data sets are available in rattle
ls("package:rattle")        # See what functions are in the rattle package
#lsf.str("package:rattle")  # see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating Data for
# Knowledge Discovery, Springer 2011
#
# | |
# Go to the Data Tab and click on Execute | |
# Rattle will ask if you want to use the weather data as default. Click yes. | |
# | |
# SUMMARY STATISTICS | |
# Go to the Explore Tab | |
# Select summary and basics | |
# Hit Execute | |
# | |
# SCATTER PLOTS | |
# Go to the Explore Tab | |
# Select Distributions | |
# Click on Execute | |
# | |
# LOOK AT A SINGLE VARIABLE | |
# Go to Explore Tab | |
# Select RainTomorrow Bar Plot | |
# Hit Execute | |
# This produces a bar plot of the target variable RainTomorrow | |
# 84% of the observations have no rain | |
# A model that always predicts no rain should be about 84% accurate | |
# | |
# INVESTIGATE MULTIPLE VARIABLES | |
# Go to the Explore Tab | |
# In the upper panel select Box Plot and Histogram for both | |
# MaxTemp | |
# Sunshine | |
# Click Execute | |
# | |
# Boxplots top left: Temperature generally higher day before it rains | |
# Boxplots top right: Less sunshine day before it rains | |
# | |
# CORRELATIONS | |
# Go to the Explore Tab | |
# Un select any variables that may be selected | |
# Select Correlation | |
# Click on Execute | |
# | |
# | |
# INTERACTIVELY EXPLORE DATA | |
# Select Interactive and then Lattiscist | |
# In bottom center panel | |
# Select MaxTemp for y axis and | |
# Select Sunshine for x axis | |
# Place crosshair on outlier and right click | |
# | |
# BUILD A TREE MODEL | |
# Go to Model Tab | |
# Select Tree | |
# Click Execute | |
# Click on Draw button to get the graph | |
# Click on Rules button to see rules | |
# Select Log Tab to look at R code | |
# | |
# EVALUATE THE MODEL | |
# Go to the Evaluate tab | |
# Select | |
# Type = Error Matrix | |
# Model = Tree | |
# Data = Testing | |
# Click on Execute | |
# | |
# Error matrix for the Decision Tree model on weather.csv [test] (counts): | |
# | |
#Predicted | |
#Actual No Yes | |
#No 35 6 False positive rate = FP/N = 6/(35+6) = .146 = negatives incorrectly classified / total negatives | |
#Yes 5 10 True positive rate = TP/P = 10/(10+5) = .667 = positives correctly classified / total positives | |
# = sensitivity = recall = hit rate | |
# True negative rate = TN / (FP + TN) = 1 - FP rate = .854 | |
# = specificity | |
# | |
# False positives = 6 = Type I Error (Test rejects true null hypothesis) | |
# False negatives = 5 = Type II Error (Test fails to reject false null hypothesis) | |
#Error matrix for the Decision Tree model on weather.csv [test] (%): | |
# | |
#Predicted | |
#Actual No Yes | |
#No 62 11 62% (35/56) of cases model predicts it won't rain and it didn't | |
#Yes 9 18 18% (10/56) of cases model predicts it would rain and it did | |
# Accracy of test = 62% + 18% = 80% | |
# | |
#Overall error: 0.1964286 |
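# A quick base-R check of the hand calculations above, using the same
# counts (rows = actual, columns = predicted):
cm <- matrix(c(35, 5, 6, 10), nrow=2,
             dimnames=list(Actual=c("No","Yes"), Predicted=c("No","Yes")))
cm[1,2] / sum(cm[1,])        # false positive rate: 6/41 = .146
cm[2,2] / sum(cm[2,])        # true positive rate: 10/15 = .667
sum(diag(cm)) / sum(cm)      # accuracy: 45/56 = .804
1 - sum(diag(cm)) / sum(cm)  # overall error: .196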
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# From the example at RDataMining
# http://www.rdatamining.com/examples/text-mining
# This page shows an example of text mining Twitter data
#-----------------------------------------------------------------------------
# Load the necessary libraries
library(twitteR)       # twitteR provides access to Twitter data
library(tm)            # tm provides functions for text mining
library(Snowball)      # Wrappers for the Weka Java stemming functions
library(wordcloud)     # wordcloud visualizes the result with a word cloud
library(RColorBrewer)  # provides the rainbow colors
#------------------------------------------------------------------------------
# retrieve up to 100 recent tweets containing the #rstats hashtag
#
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
# Transforming Text
# The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case Twitter is off-line
#df <- read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors
myCorpus <- Corpus(VectorSource(df$text))
# After that, the corpus needs a couple of transformations, including
# changing letters to lower case,
# removing punctuation/numbers, and removing stop words.
# The general English stop-word list is tailored by
# adding "available" and "via" and removing "r".
myCorpus <- tm_map(myCorpus, tolower)            # lower case
myCorpus <- tm_map(myCorpus, removePunctuation)  # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers)      # remove numbers
# keep "r" by removing it from the stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
# Stemming Words
# In many cases, words need to be stemmed to retrieve their radicals.
# For instance, "example" and "examples" are both stemmed to "exampl".
# However, after that, one may want to complete the stems to their original
# forms, so that the words would look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires the packages Snowball, RWeka, rJava, and RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
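# A small illustration of what the stemmer does to individual words,
# using wordStem() from the SnowballC package (an assumption: SnowballC
# is the modern successor to Snowball and must be installed separately):
SnowballC::wordStem(c("example", "examples"))  # both become "exampl"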
#inspect(myCorpus[1:3])  # inspect the first three "documents"
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)  # stem completion
#
#
#inspect(myCorpus[1:3])  # Print the first three documents in the built corpus
#----------------------------------------------------------------------------------------
# Building a Document-Term Matrix
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# inspect(myDtm[266:270,31:40])
#
# findFreqTerms(myDtm, lowfreq=10)      # Frequent Terms and Associations
# findAssocs(myDtm, 'analytics', 0.30)  # which words are associated with "analytics"?
#-----------------------------------------------------------------------------------------
# Build the word cloud
# After building a document-term matrix, we can show the importance of
# words with a word cloud (also known as a tag cloud).
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud with random order and random colors
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
wordcloud(words=d$word, freq=d$freq, scale=c(4,1), min.freq=2,
          random.order=TRUE, random.color=TRUE, rot.per=.15, colors=pal)