#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# ----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)
#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A little note on how the random numbers are created:
# A transform should work on an arbitrary chunk of data. Typically
# RevoScaleR functions will test transforms on a small chunk before
# fully processing. The internal variable (.rxNumRows) gives the size
# of the chunk.
rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms=list(urns = as.integer(runif(.rxNumRows,1,11))),
           overwrite=TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)
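# As a quick sanity check in plain R (no RevoScaleR needed), the
# truncated-uniform trick used in the transform above produces the
# integers 1 through 10 with roughly equal frequency (the seed here
# is arbitrary):
set.seed(123)
table(as.integer(runif(1e5, 1, 11)))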
#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData="mortDefault2",outFile="mortDefault3",
           varsToDrop="default",
           overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula=form,
                  data = "mortDefault3",
                  numClusters = 3,
                  outFile = "mortDefault3",
                  algorithm = "lloyd",
                  overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file="mortDefault3",
                         rowSelection=urns == 5,
                         maxRowsByCols = 1000)
plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)
###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidiary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
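# A quick hypothetical check of Cmatrix() on hand-built counts. This toy
# data frame assumes the same row order that the rxCube() call further
# below produces (actual value varying fastest): TN, FN, FP, TP.
toy <- data.frame(default = c(0,1,0,1),
                  LogitPred.L = c(FALSE,FALSE,TRUE,TRUE),
                  Counts = c(900, 40, 10, 50))
Cmatrix(toy)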
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#
#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms=list(CS = creditScore,
                              YR = year,
                              yrE = yearsEmploy,
                              HA = houseAge,
                              ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE )
rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms=list(CS = creditScore,
                              YR = year,
                              yrE = yearsEmploy,
                              HA = houseAge,
                              ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE )
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data="mdTrain",
                   reportProgress=rxGetOption("reportProgress") )
)
#
# Timings from three runs:
#Elapsed computation time: 21.533 secs.
#user  system elapsed
#56.15   12.02   21.55
#Elapsed computation time: 23.149 secs.
#user  system elapsed
#56.81   10.58   23.17
#Elapsed computation time: 24.384 secs.
#user  system elapsed
#59.29   10.31   24.48
summary(model)
#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable: round() thresholds the predicted
# probability at 0.5, and as.logical() turns the 0/1 result into FALSE/TRUE
rxDataStep(inData="mdTest",outFile="mdTest",
           transforms=list(LogitPred.L = as.logical(round(LogitPred))),
           overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)
# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC Curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------
# BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data="mdTrain",
                        blocksPerRead = 1,
                        maxDepth=5,
                        reportProgress=rxGetOption("reportProgress") )
)
##
#Elapsed time for RxDTreeBase: 89.545 secs.
#
#user  system elapsed
#245.13   12.50   89.57
#Elapsed time for RxDTreeBase: 403.785 secs.
# This run was to fully build out the tree
#user   system elapsed
#1092.37   75.89  403.83
model.tree
#
#----------------------------------------------------------------
# Plot the Tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------
###### - END DEMO HERE - ###########
#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
#                    "ISwR","partykit","pROC","rattle","RColorBrewer",
#                    "rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A First look at R
# A simple regression example from
# Statistics and Computing, Introductory Statistics with R
# Peter Dalgaard, Springer 2002
##
library(ISwR)          # Load a library
data()                 # Have a look at what data sets are available
data(thuesen)          # Load thuesen into the environment
thuesen                # Have a look at it
class(thuesen)         # Find out what kind of object thuesen is
sapply(thuesen,class)  # See what kinds of animal the variables are
#
plot(short.velocity ~ blood.glucose, data=thuesen)  # plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity)  # plot the data by indexing into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen)  # build a linear model
summary(model)  # Look at the results
str(model)      # Look at the structure of the model object
# Build a fancier plot
plot(x=thuesen$blood.glucose,
     y=thuesen$short.velocity,
     xlab="blood glucose (mmol / l)",
     ylab="circumferential shortening velocity (%/s)",
     main="Thuesen Data set",
     col="blue",
     pch=19)
abline(model,col="red")
#
par(mfrow=c(2,2))        # Set up for multiple plots
plot(model, col="blue")  # look at some diagnostics
#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data,probs=seq(0,1,0.25))
  res <- list(min,max,q)
  return(res)
}
attach(thuesen)           # make the columns of thuesen available
                          # in the global environment as variables
joe.stats(blood.glucose)  # Run our function
summary(blood.glucose)    # R does it better
# Set up for later
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
# SOME ADDITIONAL ONLINE RESOURCES
# An Introduction to R
# Notes on R: A Programming Environment for Data Analysis and Graphics
# Version 2.15.2 (2012-10-26)
# http://cran.r-project.org/doc/manuals/R-intro.pdf
#
# Using R for Data Analysis and Graphics
# Introduction, Code and Commentary
# J H Maindonald
# http://cran.r-project.org/doc/contrib/usingR.pdf
#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the data into training and test sets,
# builds two different decision trees (rpart) using the training data, and
# evaluates their performance using the test data set.
# An ROC curve is produced for the better model.
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
# name <- "weather.csv"
# path <- file.path(getwd(),name)
# weather <- read.csv(path,header=TRUE)
# Show weather on the IDE editor
data(weather)
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42)  # Set seed
#-------------------------------------------------------------------------
# Determine the observations for the training and test datasets.
N <- nrow(weather)                 # 366 observations
train <- sample(N, 0.8*N)          # 292 observations
test <- setdiff(seq_len(N),train)  # 74 observations not in train
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input <- names(weather)[1:(M-2)]   # names of input variables
target <- "RainTomorrow"           # name of target variable
form <- formula(RainTomorrow ~ .)  # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data=weather[train, c(input,target)],
                method="class",
                parms=list(split="information"),
                control=rpart.control(usesurrogate=0, maxsurrogate=0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m           # print the model
printcp(tree.m)  # print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the test set
pred <- predict(tree.m, weather[test, c(input,target)], type="class")
pred <- factor(pred, levels=c("Yes","No"))  # reorder (not relabel) the levels to match
                                            # the documentation for confusionMatrix
# Generate the confusion matrix
actual <- weather[test, c(input,target)]$RainTomorrow
actual <- factor(actual, levels=c("Yes","No"))  # reorder the levels the same way
AP <- c("Predicted","Actual")    # dimension names for CM
CM <- table(pred,actual,dnn=AP)  # CM counts
confusionMatrix(CM)              # from the caret package
?confusionMatrix                 # Look at the meaning of the confusionMatrix outputs
# Notes
# The "no-information rate" shown in the output is the largest proportion of the observed classes.
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree.
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi-squared test is that the actual and predicted
# probabilities are the same.
# See http://en.wikipedia.org/wiki/McNemar%27s_test
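# A minimal sketch of how these statistics fall out of a 2x2 table in
# base R, assuming a confusion matrix laid out like CM above
# (rows = predicted, columns = actual):
nir <- max(colSums(CM)) / sum(CM)                  # no-information rate
po  <- sum(diag(CM)) / sum(CM)                     # observed accuracy
pe  <- sum(rowSums(CM) * colSums(CM)) / sum(CM)^2  # agreement expected by chance
(po - pe) / (1 - pe)                               # Cohen's kappa
mcnemar.test(CM)                                   # McNemar's chi-squared test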
# | |
#--------------------------------------------------------------------------------------------
# Try another model using different variables
form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
                 data=weather[train, c(input,target)],
                 method="class",
                 parms=list(split="information"),
                 control=rpart.control(usesurrogate=2,
                                       maxsurrogate=0,
                                       minsplit=30,
                                       maxdepth=20))
#----------------------------------------------------------------------------------------------
# Plot the new tree
drawTreeNodes(tree.m2)
title(main="Weather Data tree.m2",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
tree.mod.p <- as.party(tree.m2)  # convert tree.m2 into a party object
plot(tree.mod.p)
#----------------------------------------------------------------------------
# Evaluate the performance of the new model on the test set
pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
pred2 <- factor(pred2, levels=c("Yes","No"))  # reorder the levels as before
CM2 <- table(pred2,actual,dnn=AP)
confusionMatrix(CM2)
# -----------------------------------------------------------------------------------
#
# GENERATE THE ROC CURVE FOR THE BEST MODEL
prROC <- predict(tree.m, weather[test, c(input,target)])[,2]  # predicted probability of rain
#
# Get the vector RainTomorrow in the test data set
testRT <- weather[test, c(input,target)]$RainTomorrow
pr <- prediction(prROC, testRT)
#------------------------------------------------------------------------------------
# Plot the ROC curve
plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2, add=FALSE)
# fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
# tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
segments(0,0,1,1,col="blue",lwd=2)
# Add a legend to the plot.
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
# Add decorations to the plot.
title(main="ROC Curve weather.csv [test data]",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#
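# A quick hand check of the tpr/fpr definitions above at a single
# threshold (0.5 is an arbitrary choice), using prROC and testRT from
# this script; performance() sweeps the same calculation over all thresholds.
yhat <- prROC > 0.5
sum(yhat & testRT == "Yes") / sum(testRT == "Yes")  # tpr = TP/P
sum(yhat & testRT == "No")  / sum(testRT == "No")   # fpr = FP/N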
#------------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# DATA MINING with CARET
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#------------------------------------------------------------------------------
# INTRODUCTION TO THE CARET PACKAGE
# caret is a feature-rich package for doing data mining in R.
# This script explores caret's capabilities using data included in the
# package that was described in the paper:
# Hill et al., "Impact of image segmentation on high-content
# screening data quality for SK-BR-3 cells",
# BMC Bioinformatics (2007) vol 8 (1) pp. 340
#
# Background
# Well-segmented cells are cells for which location and size may be accurately determined
# through optical measurements. Cells that are not well-segmented (WS) are said to be
# poorly-segmented (PS).
#
# Problem
# Given a set of optical measurements, can we predict which cells will be PS?
# This is a classic classification problem.
#---------------------------
library(ada)         # Boosting algorithms
library(caret)
library(rpart)       # CART algorithm for decision trees
library(partykit)    # Plotting trees
library(doParallel)  # parallel processing; by default:
                     # multicore functionality on Unix (single machine only),
                     # snow functionality on Windows (cluster)
library(pROC)        # plot the ROC curve
library(corrplot)    # plot correlations
#---------------------------
# data(package="caret")
data(segmentationData)  # Load the segmentation data set
dim(segmentationData)
#[1] 2019 61
head(segmentationData)  # Have a look at the data
trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
trainData <- segmentationData[trainIndex,]
dim(trainData)
#1010 61
testData <- segmentationData[-trainIndex,]
dim(testData)
#1009 61
#-------------------------------------------------------------------------------------
# VISUALIZE CORRELATIONS
trainV <- trainData[,4:61]
corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")
#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth=30,  # the maximum depth of any node of the final tree
                         cp = 0.01,    # complexity parameter: any split that does not decrease
                                       # the overall lack of fit by a factor of cp is not attempted
                         minsplit=20,  # the minimum number of observations that must exist in a
                                       # node in order for a split to be attempted
                         xval=10)      # number of cross-validations
ada.model <- ada(formula=form,
                 data=trainData,
                 control=control,
                 nu = .01,  # shrinkage parameter for boosting
                 iter=50)
ada.model$model[[1]]  # Look at the trees in the model
ada.model             # look at the model performance
plot(ada.model,TRUE)  # Plot error rate vs. iterations of the model
varplot(ada.model)    # Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values
# for the three tuning parameters?
#   nu
#   iter
#   maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
#   for each resampled data set do
#     hold out some samples
#     for each combination of the three tuning parameters do
#       Fit the model on the resampled data set
#       Predict the values of class on the hold-out samples
#     end
#   Calculate AUC: the area under the ROC curve for each sample
#   Select the combination of tuning parameters that yields the best AUC
#
# caret provides the "train" function to do all of this
#
# The trainControl function sets the training method
# Note: the default metrics for picking the best model are accuracy and Cohen's kappa
#
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method="repeatedcv",  # use repeated k-fold cross-validation
                     number=5,             # the number of folds
                     repeats=2,            # do 2 repetitions of 5-fold cv
                     summaryFunction=twoClassSummary,  # Use AUC to pick the best model
                     classProbs=TRUE)
# Use expand.grid to specify the search space
# Note that the default search grid selects 3 values of each tuning parameter
#
grid <- expand.grid(.nu=c(.1,1),
                    .iter=c(20,50),
                    .maxdepth=c(20,30))
#
set.seed(1)
#names(trainData)
trainX <- trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")
cl <- makeCluster(4)  # Use this to manually create a cluster.
                      # But, since I only have a single Windows machine,
                      # all I am doing is passing the number of cores to use to
                      # registerDoParallel()
registerDoParallel(cl)  # Register a parallel backend for train
getDoParWorkers()
system.time(ada.tune <- train(x=trainX,y=trainData$Class,
                              method = "ada",
                              metric = "ROC",
                              trControl = ctrl,
                              control=control,
                              tuneGrid=grid))
#
stopCluster(cl)
#user  system elapsed
#14.33    0.02  206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune             # Look at the results for the training grid
ada.tune$finalModel  # Look at the performance of the final model
plot(ada.tune)       # Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use the roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor=ada.probs$PS,
               response=testData$Class,
               levels=rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc  # Get the area under the curve
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
  svm.tune <- train(x=trainX,
                    y= trainData$Class,
                    method = "svmRadial",
                    tuneLength = 5,  # try 5 values of the cost parameter
                    preProc = c("center","scale"),
                    metric="ROC",
                    trControl=ctrl)  # same as for ada above
)
#user  system elapsed
#2.40    0.14   26.10
#--------------------------------------------------------------
# SVM RESULTS
svm.tune             # Look at the results for the training grid
svm.tune$finalModel  # Look at the performance of the final model
plot(svm.tune,
     metric="ROC",
     scales=list(x=list(log=2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models, we can compare the models using resampling.
# See Hothorn et al., "The design and analysis of benchmark experiments",
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699,
# for comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values    # Look at the resample values
summary(rValues)  # Summarize the resamples
#---------------------------------------------
xyplot(rValues,metric="ROC")    # scatter plot
bwplot(rValues,metric="ROC")    # boxplot
parallel(rValues,metric="ROC")  # parallel plot
dotplot(rValues,metric="ROC")   # dotplot
#
##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#################################################################################
#
library(rattle)  # load the rattle package
rattle()         # start the rattle user interface
#
#data()                     # see what data sets are available in all of the loaded packages
data(package="rattle")      # see what data sets are available in rattle
ls("package:rattle")        # See what functions are in the rattle package
#lsf.str("package:rattle")  # see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating Data for
# Knowledge Discovery, Springer 2011
#
# | |
# Go to the Data Tab and click on Execute | |
# Rattle will ask if you want to use the weather data as default. Click yes. | |
# | |
# SUMMARY STATISTICS | |
# Go to the Explore Tab | |
# Select summary and basics | |
# Hit Execute | |
# | |
# SCATTER PLOTS | |
# Go to the Explore Tab | |
# Select Distributions | |
# Click on Execute | |
# | |
# LOOK AT A SINGLE VARIABLE | |
# Go to Explore Tab | |
# Select RainTomorrow Bar Plot | |
# Hit Execute | |
# This produces a bar plot of the target variable RainTomorrow | |
# 84% of the observations have no rain | |
# A model that always predicts no rain should be about 84% accurate | |
# | |
# INVESTIGATE MULTIPLE VARIABLES | |
# Go to the Explore Tab | |
# In the upper panel select Box Plot and Histogram for both | |
# MaxTemp | |
# Sunshine | |
# Click Execute | |
# | |
# Boxplots top left: Temperature generally higher day before it rains | |
# Boxplots top right: Less sunshine day before it rains | |
# | |
# CORRELATIONS | |
# Go to the Explore Tab | |
# Un select any variables that may be selected | |
# Select Correlation | |
# Click on Execute | |
# | |
# | |
# INTERACTIVELY EXPLORE DATA | |
# Select Interactive and then Lattiscist | |
# In bottom center panel | |
# Select MaxTemp for y axis and | |
# Select Sunshine for x axis | |
# Place crosshair on outlier and right click | |
# | |
# BUILD A TREE MODEL | |
# Go to Model Tab | |
# Select Tree | |
# Click Execute | |
# Click on Draw button to get the graph | |
# Click on Rules button to see rules | |
# Select Log Tab to look at R code | |
# | |
# EVALUATE THE MODEL | |
# Go to the Evaluate tab | |
# Select | |
# Type = Error Matrix | |
# Model = Tree | |
# Data = Testing | |
# Click on Execute | |
# | |
# Error matrix for the Decision Tree model on weather.csv [test] (counts): | |
# | |
#Predicted | |
#Actual No Yes | |
#No 35 6 False positive rate = FP/N = 6/(35+6) = .146 = negatives incorrectly classified / total negatives | |
#Yes 5 10 True positive rate = TP/P = 10/(10+5) = .667 = positives correctly classified / total positives | |
# = sensitivity = recall = hit rate | |
# True negative rate = TN / (FP + TN) = 1 - FP rate = .854 | |
# = specificity | |
# | |
# False positives = 6 = Type I Error (Test rejects true null hypothesis) | |
# False negatives = 5 = Type II Error (Test fails to reject false null hypothesis) | |
#Error matrix for the Decision Tree model on weather.csv [test] (%): | |
# | |
#Predicted | |
#Actual No Yes | |
#No 62 11 62% (35/56) of cases model predicts it won't rain and it didn't | |
#Yes 9 18 18% (10/56) of cases model predicts it would rain and it did | |
# Accracy of test = 62% + 18% = 80% | |
# | |
#Overall error: 0.1964286 |
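# A quick base-R check of the hand calculations above, using the same
# counts (rows = actual, columns = predicted):
cm <- matrix(c(35, 5, 6, 10), nrow=2,
             dimnames=list(Actual=c("No","Yes"), Predicted=c("No","Yes")))
cm[1,2] / sum(cm[1,])        # false positive rate: 6/41 = .146
cm[2,2] / sum(cm[2,])        # true positive rate: 10/15 = .667
sum(diag(cm)) / sum(cm)      # accuracy: 45/56 = .804
1 - sum(diag(cm)) / sum(cm)  # overall error: .196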
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# From the example at RDataMining
# http://www.rdatamining.com/examples/text-mining
# This page shows an example of text mining Twitter data
#-----------------------------------------------------------------------------
# Load the necessary libraries
library(twitteR)       # twitteR provides access to Twitter data
library(tm)            # tm provides functions for text mining
library(Snowball)      # Wrappers for the Weka Java stemming functions
library(wordcloud)     # wordcloud visualizes the result with a word cloud
library(RColorBrewer)  # provides the rainbow colors
#------------------------------------------------------------------------------
# retrieve up to 100 recent tweets containing the #rstats hashtag
#
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
# Transforming Text
# The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case Twitter is off-line
#df <- read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors
myCorpus <- Corpus(VectorSource(df$text))
# After that, the corpus needs a couple of transformations, including
# changing letters to lower case,
# removing punctuation/numbers, and removing stop words.
# The general English stop-word list is tailored by
# adding "available" and "via" and removing "r".
myCorpus <- tm_map(myCorpus, tolower)            # lower case
myCorpus <- tm_map(myCorpus, removePunctuation)  # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers)      # remove numbers
# keep "r" by removing it from the stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
# Stemming Words
# In many cases, words need to be stemmed to retrieve their radicals.
# For instance, "example" and "examples" are both stemmed to "exampl".
# However, after that, one may want to complete the stems to their original
# forms, so that the words would look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires the packages Snowball, RWeka, rJava, and RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
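# A small illustration of what the stemmer does to individual words,
# using wordStem() from the SnowballC package (an assumption: SnowballC
# is the modern successor to Snowball and must be installed separately):
SnowballC::wordStem(c("example", "examples"))  # both become "exampl"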
#inspect(myCorpus[1:3])  # inspect the first three "documents"
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)  # stem completion
#
#
#inspect(myCorpus[1:3])  # Print the first three documents in the built corpus
#----------------------------------------------------------------------------------------
# Building a Document-Term Matrix
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# inspect(myDtm[266:270,31:40])
#
# findFreqTerms(myDtm, lowfreq=10)      # Frequent Terms and Associations
# findAssocs(myDtm, 'analytics', 0.30)  # which words are associated with "analytics"?
#-----------------------------------------------------------------------------------------
# Build the word cloud
# After building a document-term matrix, we can show the importance of
# words with a word cloud (also known as a tag cloud).
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud with random order and random colors
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
wordcloud(words=d$word, freq=d$freq, scale=c(4,1), min.freq=2,
          random.order=TRUE, random.color=TRUE, rot.per=.15, colors=pal)