Skip to content

Instantly share code, notes, and snippets.

@shaheeng
Created April 11, 2016 22:06
Show Gist options
  • Save shaheeng/0d44595a6d8fb9df2578c68c842f2d43 to your computer and use it in GitHub Desktop.
Save shaheeng/0d44595a6d8fb9df2578c68c842f2d43 to your computer and use it in GitHub Desktop.
# Purpose: Build classification models to predict wine quality
# Use three different classification algorithms and compare their accuracies
# Author : Shaheen Gauher - Data Scientist at Microsoft
# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105
##download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# Read the red-wine quality data (semicolon-separated, with a header row)
# directly from the UCI repository.
data_wine <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
  header = TRUE,
  sep = ";",
  na.strings = "NA"
) #1599 12
class(data_wine) #"data.frame"
# Collapse the numeric quality score into three labels:
#   5 or lower -> Low
#   exactly 6  -> Med
#   above 6    -> High
# 'None' is only the fallback for a score that matches none of the rules.
data_wine$qualityV2 <- with(
  data_wine,
  ifelse(
    quality <= 5, 'Low',
    ifelse(
      quality == 6, 'Med',
      ifelse(quality > 6, 'High', 'None')
    )
  )
)
# Drop the numeric score and let the relabelled column take over its name.
data_wine$quality <- NULL
names(data_wine) <- replace(
  names(data_wine),
  names(data_wine) == 'qualityV2',
  'quality'
)
#=============================
# The first xdf file is written into the current working directory.
pathc <- getwd()
# Write the data frame out as an xdf file with rxDataStep() and keep the
# object the call returns for the following steps.
dataclassi_xdf <- file.path(pathc, 'dataclassi_xdf.xdf')
data_classi <- rxDataStep(
  inData = data_wine,
  outFile = dataclassi_xdf,
  overwrite = TRUE,
  rowsPerRead = 500,
  reportProgress = 0
)
class(data_classi) # "RxXdfData"
# Derive a categorical column 'factorQuality' from the 'quality' column,
# writing the result back over the same xdf.
rxFactors(
  inData = data_classi,
  outFile = data_classi,
  factorInfo = list(factorQuality = list(varName = "quality")),
  overwrite = TRUE,
  reportProgress = 0
)
# The original 'quality' column is no longer needed: keep everything else.
ColsToKeep <- setdiff(names(data_classi), 'quality')
data_classi <- rxDataStep(
  inData = data_classi,
  outFile = 'data_classi_temp.xdf',
  varsToKeep = ColsToKeep,
  overwrite = TRUE
)
# The label column 'factorQuality' is called 'LabelsCol' from here on.
names(data_classi) <- replace(
  names(data_classi),
  names(data_classi) == 'factorQuality',
  'LabelsCol'
)
# Tag every row as 'train' (probability 0.8) or 'test' (probability 0.2) in a
# new factor column 'splitcol'.
# The levels are given explicitly (0 -> 'test', 1 -> 'train'): the transform is
# evaluated on one chunk of rows at a time, and without explicit levels
# factor() fails on a chunk in which rbinom() happens to return only one
# distinct value.
rxDataStep(
  inData = data_classi,
  outFile = data_classi,
  transforms = list(
    splitcol = factor(
      rbinom(.rxNumRows, 1, 0.8),
      levels = c(0, 1),
      labels = c('test', 'train')
    )
  ),
  overwrite = TRUE
)
# Split on 'splitcol' with rxSplit(), which turns one input '.xdf' file or data
# frame into several '.xdf' files or a list of data frames.
# The outputs come back in the order of the factor levels ('test', 'train'),
# so the file suffixes are listed in that same order; otherwise the file named
# "...Train" would hold the test rows and the one named "...Test" the training
# rows.
listofxdfs <- rxSplit(
  data_classi,
  outFileBase = 'data_classi_split',
  outFileSuffixes = c("Test", "Train"),
  splitByFactor = "splitcol",
  overwrite = TRUE
)
testdata <- listofxdfs[[1]]      # level 'test'
trainingdata <- listofxdfs[[2]]  # level 'train'
# Every column except the label and the split marker serves as a predictor.
allfeatures <- setdiff(names(data_classi), c('LabelsCol', 'splitcol'))
# Assemble the model formula: LabelsCol ~ <feature 1> + <feature 2> + ...
formula <- as.formula(
  paste0('LabelsCol ~ ', paste(allfeatures, collapse = ' + '))
)
formula
# Display names of the three classification algorithms used in this script.
Algorithms <- c(
  "Decision Forest Classification",
  "Boosted Decision Tree Classification",
  "Decision Tree Classification"
)
################################################################################
## Decision forest modeling
################################################################################
# Decision forest classifier, fitted with rxDForest() on the training split.
DForest_model <- rxDForest(
  formula = formula,
  data = trainingdata,
  nTree = 50,
  mTry = 2,
  cp = 0.01,
  seed = 10,          # fixed seed for the forest
  overwrite = TRUE,
  reportProgress = 0
)
DForest_model
class(DForest_model) #"rxDForest"
################################################################################
## Boosted tree modeling
################################################################################
# Boosted decision trees, fitted with rxBTrees() on the training split.
# The response has three classes, hence the multinomial loss.
# A fixed seed is passed, as for the decision forest, so that repeated fits on
# the same training data give the same model.
BoostedTree_model <- rxBTrees(
  formula = formula,
  data = trainingdata,
  learningRate = 0.2,
  minSplit = 10,
  minBucket = 10,
  nTree = 100,
  lossFunction = "multinomial",
  seed = 10,
  reportProgress = 0
)
BoostedTree_model
class(BoostedTree_model)
################################################################################
## Decision Tree Modelling
################################################################################
# Single decision tree, fitted with rxDTree() on the training split.
# rxDTree() grows exactly one tree and has no 'nTree' argument, so none is
# passed here.
DTree_model <- rxDTree(
  formula = formula,
  data = trainingdata,
  minSplit = 10,
  minBucket = 10,
  reportProgress = 0
)
DTree_model
class(DTree_model)
################################################################################
#=======================================================
# Compute the accuracy of the trained models and how they perform on the test data
#=======================================================
# Compute the classification accuracy of a trained model on a data set.
#
# Arguments:
#   ML_model  - fitted model; it is handed to rxPredict() for scoring
#   scoredata - data to score; the scored output must contain the columns
#               'LabelsCol' (actual label) and 'LabelsCol_Pred' (predicted label)
# Returns:
#   the number of correctly predicted rows divided by the number of rows in
#   the confusion matrix (table() leaves out rows with a missing label)
computeaccuracy <- function(ML_model, scoredata) {
  # Score into a scratch xdf in the session's temp directory and remove it on
  # exit, so no file in the working directory is deleted or left behind.
  scored_path <- tempfile(pattern = "modelout_xdf", fileext = ".xdf")
  on.exit(unlink(scored_path), add = TRUE)
  modelout_xdf <- RxXdfData(scored_path)
  rxPredict(
    ML_model,
    data = scoredata,
    outData = modelout_xdf,
    overwrite = TRUE,
    writeModelVars = TRUE,
    reportProgress = 0
  )
  # Pull the actual and predicted label columns into a data frame.
  results_model_df <- rxDataStep(
    inData = modelout_xdf,
    outFile = NULL,
    varsToKeep = c('LabelsCol_Pred', 'LabelsCol'),
    reportProgress = 0
  )
  actual <- results_model_df$LabelsCol
  predicted <- results_model_df$LabelsCol_Pred
  # Put both label vectors on one common set of classes. Otherwise a class
  # that is never predicted is missing from the table's columns, the matrix is
  # no longer square, and diag() pairs up cells of different classes.
  is_label <- function(x) is.factor(x) || is.character(x)
  if (is_label(actual) && is_label(predicted)) {
    label_set <- function(x) if (is.factor(x)) levels(x) else sort(unique(x))
    classes <- union(label_set(actual), label_set(predicted))
    actual <- factor(actual, levels = classes)
    predicted <- factor(predicted, levels = classes)
  }
  # Confusion matrix: rows are actual classes, columns predicted classes;
  # the diagonal holds the correct predictions.
  cm <- as.matrix(table(Actual = actual, Predicted = predicted))
  sum(diag(cm)) / sum(cm)
}
# Usage of computeaccuracy():
#   computeaccuracy(ML_model, testdata)
#   computeaccuracy(ML_model, trainingdata)
#
# For each model, print its accuracy on the training split first and on the
# test split second.

# ---- Decision forest ----
ML_model <- DForest_model
cat(
  'For Decision Forest: accuracy = ',
  computeaccuracy(ML_model, trainingdata),
  '\n'
)
cat(
  'For Decision Forest: accuracy on test data = ',
  computeaccuracy(ML_model, testdata),
  '\n'
)

# ---- Boosted trees ----
ML_model <- BoostedTree_model
cat(
  'For Boosted tree: accuracy = ',
  computeaccuracy(ML_model, trainingdata),
  '\n'
)
cat(
  'For Boosted tree: accuracy on test data = ',
  computeaccuracy(ML_model, testdata),
  '\n'
)

# ---- Decision tree ----
ML_model <- DTree_model
cat(
  'For Decision Tree: accuracy = ',
  computeaccuracy(ML_model, trainingdata),
  '\n'
)
cat(
  'For Decision Tree: accuracy on test data = ',
  computeaccuracy(ML_model, testdata),
  '\n'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment