Skip to content

Instantly share code, notes, and snippets.

@mspan
Last active August 29, 2015 13:56
Show Gist options
  • Save mspan/9315961 to your computer and use it in GitHub Desktop.
Save mspan/9315961 to your computer and use it in GitHub Desktop.
R Random Forest Tutorial with cell phone data (Connecticut R Users Group Meeting, March 4, 2014)
library(randomForest)

# For reference, how to download the cleaned-up dataset we'll be using:
# url <- "https://spark-public.s3.amazonaws.com/dataanalysis/samsungData.rda"
# destfile <- "./samsungData2.rda"
# download.file(url, destfile, method = "curl", quiet = FALSE, mode = "wb", cacheOK = TRUE)

# Load the pre-cleaned Samsung phone-accelerometer dataset; this creates the
# object `samsungData` in the global environment.
# NOTE(review): hard-coded local path — adjust for your machine.
data_path <- "~/Dropbox/random_phone_tutorial/samsungData.rda"
if (!file.exists(data_path)) {
  stop("Dataset not found at ", data_path, call. = FALSE)
}
load(data_path)
# Create a plain data.frame copy of the loaded object.
df <- data.frame(samsungData)

# Quick check on completeness of the data: count of rows with no NAs.
sum(complete.cases(df))

# Let's see how many different subjects there are.
all_subjects <- unique(df$subject)
print(all_subjects)
num_subjects <- NROW(all_subjects)
print(num_subjects)

# Samples recorded per subject.
table(df$subject)
barplot(table(df$subject), xlab = "subject", ylab = "number of samples")

# The response must be a factor for classification.
df$activity <- as.factor(df$activity)

# Quick exploratory pairs plots. With `data = df`, columns are referenced by
# bare name so the formula is resolved inside the data frame (mixing `df$col`
# with `data = df` is redundant and bypasses the data argument).
pairs(activity ~ tBodyAcc.mean...X + tBodyAcc.mean...Y + tBodyAcc.mean...Z, data = df)
pairs(activity ~ tBodyGyro.mean...X + tBodyGyro.mean...Y + tBodyGyro.mean...Z, data = df)
plot(df$activity)
# ... TODO more plots ...
# Create test and training sets split by subject: holding out whole subjects
# gives an honest estimate of generalization to people never seen in training.
test_subjects <- c(27, 28, 29, 30)

testset <- data.frame(samsungData[samsungData$subject %in% test_subjects, ])
testset$subject <- NULL  # drop the id column so it is never used as a predictor

trainset <- data.frame(samsungData[!samsungData$subject %in% test_subjects, ])
trainset$subject <- NULL
# Perform random forest modeling.
set.seed(100)

# Copy of the test-set response, passed as ytest so randomForest reports
# test-set error alongside OOB error as trees are grown. Column 562 is
# `activity` (subject was already dropped from testset above).
names(testset[562])
predictionResults <- as.factor(testset[, 562])

# Explicitly show the default number of variables randomly sampled as
# candidates at each split: floor(sqrt(p)), where p = 561 predictors
# (ncol(trainset) minus the one response column; subject is already gone).
numvars_for_split <- floor(sqrt(ncol(trainset) - 1))
num_trees_to_grow <- 300  # default is 500 ... same results, so decreased for demo

# Generate the random forest.
# Based on system.time(), about 140 seconds to run on Mike's laptop.
# Note: if the subject column had NOT been dropped, the formula would be
#   as.factor(activity) ~ . - subject
my_rf <- randomForest(as.factor(activity) ~ ., data = trainset,
                      xtest = testset[, 1:561], ytest = predictionResults,
                      keep.forest = TRUE, replace = TRUE,
                      ntree = num_trees_to_grow, mtry = numvars_for_split)
# Detour: since we kept the forest (keep.forest=TRUE), we can run further
# predictions ... repeating the same data here though.
# Trees can also be grown later and combined.
res <- predict(my_rf, testset[, 1:561])
NROW(res)

# Now back to the main results: test-set confusion matrix.
test_confusion_table <- my_rf$test$confusion
print(test_confusion_table)

# Calculate overall misclassification rate of the predictions from the
# confusion table. randomForest appends a `class.error` column, so the table
# is k x (k+1); restrict to the square block of counts before summing,
# otherwise the error rates themselves pollute numerator and denominator.
conf_counts <- test_confusion_table[, seq_len(nrow(test_confusion_table)), drop = FALSE]
misclass_pred <- sum(conf_counts[row(conf_counts) != col(conf_counts)]) / sum(conf_counts)
print(misclass_pred)
# Plot the OOB error rate versus increasing number of trees: one curve per
# class plus the aggregate OOB curve. Size the palette from err.rate itself
# instead of hard-coding a count, so plot colors and legend always agree.
mycols <- seq_len(ncol(my_rf$err.rate))
plot(my_rf, col = mycols, main = "OOB error rate as number of trees increases", lwd = 1.5)
legend("right", colnames(my_rf$err.rate), cex = 0.9, fill = mycols)

# Plot the top-10 variable importance, measured by mean decrease in Gini impurity.
varImpPlot(my_rf, n.var = 10, main = "Variable Importance as Measured by a Random Forest")

# For reference, plot the entire importance vector.
plot(my_rf$importance)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment