shaheeng · April 11, 2016 22:06
diff --git a/predictwinequality_3labels_MSR.R b/predictwinequality_3labels_MSR.R
 # Purpose: Build classification models to predict wine quality
 #          Use three different classification algorithms and compare their accuracies
 # Author : Shaheen Gauher - Data Scientist at Microsoft

 # Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
 # http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html 
 # MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

 ##download data from
 #https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
 # Read the red-wine data set (semicolon-separated, with a header row) directly
 # from the UCI repository. TRUE is spelled out because `T` can be reassigned.
 data_wine <- read.table(
   "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
   header = TRUE, sep = ";", na.strings = "NA"
 ) #1599 12
 class(data_wine)  #"data.frame"

 # Relabel the numeric quality rating in a single vectorized pass:
 #   quality <= 5 -> 'Low'
 #   quality == 6 -> 'Med'
 #   quality  > 6 -> 'High'
 # A value matching none of the rules keeps the sentinel 'None'; NA stays NA.
 quality_score <- data_wine$quality
 quality_label <- ifelse(quality_score <= 5, 'Low',
                  ifelse(quality_score == 6, 'Med',
                  ifelse(quality_score > 6, 'High', 'None')))

 # Swap the numeric column for the character labels; the new 'quality' column
 # is appended as the last column of the data frame.
 data_wine$quality <- NULL
 data_wine$quality <- quality_label
 #=============================

 pathc <- getwd()  #working directory

 # Convert the data frame to an xdf object using rxDataStep()
 dataclassi_xdf <- file.path(pathc, 'dataclassi_xdf.xdf')
 data_classi <- rxDataStep(inData = data_wine, outFile = dataclassi_xdf,
                           rowsPerRead = 500, overwrite = TRUE, reportProgress = 0)
 class(data_classi)  # "RxXdfData"

 # Add a categorical column 'factorQuality' derived from the 'quality' column
 rxFactors(inData = data_classi, outFile = data_classi, overwrite = TRUE,
           factorInfo = list(factorQuality = list(varName = "quality")),
           reportProgress = 0)

 # The original 'quality' column is no longer needed: keep everything else
 ColsToKeep <- setdiff(names(data_classi), c('quality'))
 data_classi <- rxDataStep(inData = data_classi, outFile = 'data_classi_temp.xdf',
                           varsToKeep = ColsToKeep, overwrite = TRUE)

 # Rename the label column 'factorQuality' to 'LabelsCol'
 names(data_classi)[names(data_classi) == 'factorQuality'] <- 'LabelsCol'

 # Create the column 'splitcol' used for splitting: each row is drawn as
 # 1 ('train') with probability 0.8, otherwise 0 ('test').
 # `levels` is given explicitly so the labels stay attached to the right value
 # (and factor() does not fail) even if a chunk happens to draw only 0s or 1s.
 rxDataStep(inData = data_classi, outFile = data_classi,
            transforms = list(
              splitcol = factor(rbinom(.rxNumRows, 1, 0.8),
                                levels = c(0, 1), labels = c('test', 'train'))
            ),
            overwrite = TRUE)

 # Split on 'splitcol'.
 # rxSplit() -- Splits an input '.xdf' file or data frame into multiple '.xdf'
 # files or a list of data frames.
 # The base-name argument is `outFilesBase` (the former `outFileBase` spelling
 # does not match it). Suffixes are listed in factor-level order
 # ('test', 'train') so each file name describes its content.
 # NOTE(review): assumes rxSplit pairs suffixes and outputs with the factor
 # levels in level order -- confirm against the rxSplit documentation.
 listofxdfs <- rxSplit(data_classi, outFilesBase = 'data_classi_split',
                       outFileSuffixes = c("Test", "Train"),
                       splitByFactor = "splitcol", overwrite = TRUE)

 # Second level of 'splitcol' is 'train', first is 'test'
 trainingdata <- listofxdfs[[2]]
 testdata     <- listofxdfs[[1]]

 # Every column except the label and the split marker is a model feature
 allfeatures <- setdiff(names(data_classi), c('LabelsCol', 'splitcol'))

 # Build the modelling formula: LabelsCol ~ feature1 + feature2 + ...
 formula <- as.formula(paste('LabelsCol', paste(allfeatures, collapse = ' + '), sep = ' ~ '))
 formula

 Algorithms <- c("Decision Forest Classification",
                 "Boosted Decision Tree Classification",
                 "Decision Tree Classification")

 ################################################################################
 ## Decision forest modeling
 ################################################################################
 # Fit a decision forest with rxDForest() on the training split
 DForest_model <- rxDForest(formula = formula,
                            data = trainingdata,
                            seed = 10,
                            cp = 0.01,
                            nTree = 50,
                            mTry = 2,
                            overwrite = TRUE,
                            reportProgress = 0)
 DForest_model
 class(DForest_model) #"rxDForest"

 ################################################################################
 ## Boosted tree modeling
 ################################################################################
 # Fit boosted trees with rxBTrees(); the label has three classes, hence the
 # multinomial loss
 BoostedTree_model <- rxBTrees(formula = formula,
                               data = trainingdata,
                               learningRate = 0.2,
                               minSplit = 10,
                               minBucket = 10,
                               nTree = 100,
                               lossFunction = "multinomial",
                               reportProgress = 0)
 BoostedTree_model
 class(BoostedTree_model)

 ################################################################################
 ## Decision Tree Modelling
 ################################################################################
 # Fit a single decision tree with rxDTree().
 # `nTree` was dropped from this call: it is an ensemble setting (used by
 # rxDForest/rxBTrees above) and is not an rxDTree argument.
 DTree_model <- rxDTree(formula = formula,
                        data = trainingdata,
                        minSplit = 10,
                        minBucket = 10,
                        reportProgress = 0)
 DTree_model
 class(DTree_model)

 ################################################################################
 #=======================================================
 # Compute the accuracy of the trained models and how it performs on the test data
 #=======================================================

 #' Compute the classification accuracy of a trained model on a data set
 #'
 #' Scores `scoredata` with rxPredict() into the scratch file
 #' "modelout_xdf.xdf" (relative to the working directory; any existing file of
 #' that name is removed first), reads back the columns 'LabelsCol' and
 #' 'LabelsCol_Pred', and returns the share of rows where the two agree.
 #'
 #' @param ML_model A fitted model object that rxPredict() can score.
 #' @param scoredata The data to score; it must provide the 'LabelsCol' column.
 #' @return A single number: matching rows / rows where both labels are present
 #'   (NaN when there are no such rows).
 computeaccuracy <- function(ML_model, scoredata) {
   # Start from a clean scratch file for the scored output
   if (file.exists("modelout_xdf.xdf")) {
     file.remove("modelout_xdf.xdf")
   }
   modelout_xdf <- RxXdfData("modelout_xdf.xdf") #initialise xdf object

   # writeModelVars = TRUE copies the model variables (including the actual
   # label) next to the prediction in the output
   rxPredict(ML_model, data = scoredata, outData = modelout_xdf, overwrite = TRUE,
             writeModelVars = TRUE, reportProgress = 0)

   # Pull the predicted and actual label columns into a data frame
   results_model_df <- rxDataStep(inData = modelout_xdf, outFile = NULL,
                                  varsToKeep = c('LabelsCol_Pred', 'LabelsCol'),
                                  reportProgress = 0)

   # Compare the labels themselves rather than summing the diagonal of a
   # confusion table: the diagonal is only meaningful when both factors carry
   # identical level sets in identical order, which is not guaranteed here.
   actual    <- as.character(results_model_df$LabelsCol)
   predicted <- as.character(results_model_df$LabelsCol_Pred)

   # Rows with a missing label on either side are left out, as table() did
   scored <- !is.na(actual) & !is.na(predicted)
   sum(actual[scored] == predicted[scored]) / sum(scored)
 }
 # to invoke function:
 # computeaccuracy(ML_model,testdata)
 # computeaccuracy(ML_model,trainingdata)
 #====================================================

 # Report accuracy for every trained model, first on the training split and
 # then on the test split. Each entry pairs a model with its two message
 # prefixes.
 evaluation_plan <- list(
   list(fitted      = DForest_model,
        train_label = 'For Decision Forest: accuracy = ',
        test_label  = 'For Decision Forest: accuracy on test data = '),
   list(fitted      = BoostedTree_model,
        train_label = 'For Boosted tree: accuracy = ',
        test_label  = 'For Boosted tree: accuracy on test data = '),
   list(fitted      = DTree_model,
        train_label = 'For Decision Tree: accuracy = ',
        test_label  = 'For Decision Tree: accuracy on test data = ')
 )

 for (plan_entry in evaluation_plan) {
   # ML_model is kept as the name of the model currently being evaluated
   ML_model <- plan_entry[["fitted"]]
   cat(plan_entry[["train_label"]], computeaccuracy(ML_model, trainingdata), '\n')
   cat(plan_entry[["test_label"]], computeaccuracy(ML_model, testdata), '\n')
 }
	# Purpose: Build classification models to predict wine quality
	# Use three different classification algorithms and compare their accuracies
	# Author : Shaheen Gauher - Data Scientist at Microsoft

	# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
	# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
	# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

	##download data from
	#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
	# Read the red-wine data set (semicolon-separated, with a header row) directly
	# from the UCI repository. TRUE is spelled out because `T` can be reassigned.
	data_wine <- read.table(
	  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
	  header = TRUE, sep = ";", na.strings = "NA"
	) #1599 12
	class(data_wine) #"data.frame"

	# Relabel the numeric quality rating in a single vectorized pass:
	#   quality <= 5 -> 'Low'
	#   quality == 6 -> 'Med'
	#   quality  > 6 -> 'High'
	# A value matching none of the rules keeps the sentinel 'None'; NA stays NA.
	quality_score <- data_wine$quality
	quality_label <- ifelse(quality_score <= 5, 'Low',
	                 ifelse(quality_score == 6, 'Med',
	                 ifelse(quality_score > 6, 'High', 'None')))

	# Swap the numeric column for the character labels; the new 'quality' column
	# is appended as the last column of the data frame.
	data_wine$quality <- NULL
	data_wine$quality <- quality_label
	#=============================

	pathc <- getwd() #working directory

	# Convert the data frame to an xdf object using rxDataStep()
	dataclassi_xdf <- file.path(pathc, 'dataclassi_xdf.xdf')
	data_classi <- rxDataStep(inData = data_wine, outFile = dataclassi_xdf,
	                          rowsPerRead = 500, overwrite = TRUE, reportProgress = 0)
	class(data_classi) # "RxXdfData"

	# Add a categorical column 'factorQuality' derived from the 'quality' column
	rxFactors(inData = data_classi, outFile = data_classi, overwrite = TRUE,
	          factorInfo = list(factorQuality = list(varName = "quality")),
	          reportProgress = 0)

	# The original 'quality' column is no longer needed: keep everything else
	ColsToKeep <- setdiff(names(data_classi), c('quality'))
	data_classi <- rxDataStep(inData = data_classi, outFile = 'data_classi_temp.xdf',
	                          varsToKeep = ColsToKeep, overwrite = TRUE)

	# Rename the label column 'factorQuality' to 'LabelsCol'
	names(data_classi)[names(data_classi) == 'factorQuality'] <- 'LabelsCol'

	# Create the column 'splitcol' used for splitting: each row is drawn as
	# 1 ('train') with probability 0.8, otherwise 0 ('test').
	# `levels` is given explicitly so the labels stay attached to the right value
	# (and factor() does not fail) even if a chunk happens to draw only 0s or 1s.
	rxDataStep(inData = data_classi, outFile = data_classi,
	           transforms = list(
	             splitcol = factor(rbinom(.rxNumRows, 1, 0.8),
	                               levels = c(0, 1), labels = c('test', 'train'))
	           ),
	           overwrite = TRUE)

	# Split on 'splitcol'.
	# rxSplit() -- Splits an input '.xdf' file or data frame into multiple '.xdf'
	# files or a list of data frames.
	# The base-name argument is `outFilesBase` (the former `outFileBase` spelling
	# does not match it). Suffixes are listed in factor-level order
	# ('test', 'train') so each file name describes its content.
	# NOTE(review): assumes rxSplit pairs suffixes and outputs with the factor
	# levels in level order -- confirm against the rxSplit documentation.
	listofxdfs <- rxSplit(data_classi, outFilesBase = 'data_classi_split',
	                      outFileSuffixes = c("Test", "Train"),
	                      splitByFactor = "splitcol", overwrite = TRUE)

	# Second level of 'splitcol' is 'train', first is 'test'
	trainingdata <- listofxdfs[[2]]
	testdata <- listofxdfs[[1]]

	# Every column except the label and the split marker is a model feature
	allfeatures <- setdiff(names(data_classi), c('LabelsCol', 'splitcol'))

	# Build the modelling formula: LabelsCol ~ feature1 + feature2 + ...
	formula <- as.formula(paste('LabelsCol', paste(allfeatures, collapse = ' + '), sep = ' ~ '))
	formula

	Algorithms <- c("Decision Forest Classification",
	                "Boosted Decision Tree Classification",
	                "Decision Tree Classification")

	################################################################################
	## Decision forest modeling
	################################################################################
	# Fit a decision forest with rxDForest() on the training split
	DForest_model <- rxDForest(formula = formula,
	                           data = trainingdata,
	                           seed = 10,
	                           cp = 0.01,
	                           nTree = 50,
	                           mTry = 2,
	                           overwrite = TRUE,
	                           reportProgress = 0)
	DForest_model
	class(DForest_model) #"rxDForest"

	################################################################################
	## Boosted tree modeling
	################################################################################
	# Fit boosted trees with rxBTrees(); the label has three classes, hence the
	# multinomial loss
	BoostedTree_model <- rxBTrees(formula = formula,
	                              data = trainingdata,
	                              learningRate = 0.2,
	                              minSplit = 10,
	                              minBucket = 10,
	                              nTree = 100,
	                              lossFunction = "multinomial",
	                              reportProgress = 0)
	BoostedTree_model
	class(BoostedTree_model)

	################################################################################
	## Decision Tree Modelling
	################################################################################
	# Fit a single decision tree with rxDTree().
	# `nTree` was dropped from this call: it is an ensemble setting (used by
	# rxDForest/rxBTrees above) and is not an rxDTree argument.
	DTree_model <- rxDTree(formula = formula,
	                       data = trainingdata,
	                       minSplit = 10,
	                       minBucket = 10,
	                       reportProgress = 0)
	DTree_model
	class(DTree_model)

	################################################################################
	#=======================================================
	# Compute the accuracy of the trained models and how it performs on the test data
	#=======================================================

	#' Compute the classification accuracy of a trained model on a data set
	#'
	#' Scores `scoredata` with rxPredict() into the scratch file
	#' "modelout_xdf.xdf" (relative to the working directory; any existing file of
	#' that name is removed first), reads back the columns 'LabelsCol' and
	#' 'LabelsCol_Pred', and returns the share of rows where the two agree.
	#'
	#' @param ML_model A fitted model object that rxPredict() can score.
	#' @param scoredata The data to score; it must provide the 'LabelsCol' column.
	#' @return A single number: matching rows / rows where both labels are present
	#'   (NaN when there are no such rows).
	computeaccuracy <- function(ML_model, scoredata) {
	  # Start from a clean scratch file for the scored output
	  if (file.exists("modelout_xdf.xdf")) {
	    file.remove("modelout_xdf.xdf")
	  }
	  modelout_xdf <- RxXdfData("modelout_xdf.xdf") #initialise xdf object

	  # writeModelVars = TRUE copies the model variables (including the actual
	  # label) next to the prediction in the output
	  rxPredict(ML_model, data = scoredata, outData = modelout_xdf, overwrite = TRUE,
	            writeModelVars = TRUE, reportProgress = 0)

	  # Pull the predicted and actual label columns into a data frame
	  results_model_df <- rxDataStep(inData = modelout_xdf, outFile = NULL,
	                                 varsToKeep = c('LabelsCol_Pred', 'LabelsCol'),
	                                 reportProgress = 0)

	  # Compare the labels themselves rather than summing the diagonal of a
	  # confusion table: the diagonal is only meaningful when both factors carry
	  # identical level sets in identical order, which is not guaranteed here.
	  actual <- as.character(results_model_df$LabelsCol)
	  predicted <- as.character(results_model_df$LabelsCol_Pred)

	  # Rows with a missing label on either side are left out, as table() did
	  scored <- !is.na(actual) & !is.na(predicted)
	  sum(actual[scored] == predicted[scored]) / sum(scored)
	}
	# to invoke function:
	# computeaccuracy(ML_model,testdata)
	# computeaccuracy(ML_model,trainingdata)
	#====================================================

	# Report accuracy for every trained model, first on the training split and
	# then on the test split. Each entry pairs a model with its two message
	# prefixes.
	evaluation_plan <- list(
	  list(fitted      = DForest_model,
	       train_label = 'For Decision Forest: accuracy = ',
	       test_label  = 'For Decision Forest: accuracy on test data = '),
	  list(fitted      = BoostedTree_model,
	       train_label = 'For Boosted tree: accuracy = ',
	       test_label  = 'For Boosted tree: accuracy on test data = '),
	  list(fitted      = DTree_model,
	       train_label = 'For Decision Tree: accuracy = ',
	       test_label  = 'For Decision Tree: accuracy on test data = ')
	)

	for (plan_entry in evaluation_plan) {
	  # ML_model is kept as the name of the model currently being evaluated
	  ML_model <- plan_entry[["fitted"]]
	  cat(plan_entry[["train_label"]], computeaccuracy(ML_model, trainingdata), '\n')
	  cat(plan_entry[["test_label"]], computeaccuracy(ML_model, testdata), '\n')
	}
No results found