#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# ----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)
#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A little note on how the random numbers are created:
# A transform should work on an arbitrary chunk of data. Typically
# RevoScaleR functions will test transforms on a small chunk before
# fully processing. The internal variable (.rxNumRows) gives the size
# of the chunk.
rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms = list(urns = as.integer(runif(.rxNumRows, 1, 11))),
           overwrite = TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)
#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData="mortDefault2",outFile="mortDefault3",
varsToDrop="default",
overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula = form,
                  data = "mortDefault3",
                  numClusters = 3,
                  outFile = "mortDefault3",
                  algorithm = "lloyd",
                  overwrite = TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file = "mortDefault3",
                         rowSelection = urns == 5,
                         maxRowsByCols = 1000)
plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)
###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidiary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),    # true negative rate
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),    # false negative rate
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),    # false positive rate
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2) # true positive rate
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#
#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)
rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data = "mdTrain",
                   reportProgress = rxGetOption("reportProgress"))
)
#
# Elapsed computation time: 21.533 secs.
#    user  system elapsed
#   56.15   12.02   21.55
# Elapsed computation time: 23.149 secs.
#    user  system elapsed
#   56.81   10.58   23.17
# Elapsed computation time: 24.384 secs.
#    user  system elapsed
#   59.29   10.31   24.48
summary(model)
#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject = model, data = "mdTest", outData = "mdTest",
          overwrite = TRUE, predVarNames = "LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable
rxDataStep(inData="mdTest",outFile="mdTest",
transforms=list(LogitPred.L = as.logical(round(LogitPred))),
overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)
# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC Curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------
# BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data = "mdTrain",
                        blocksPerRead = 1,
                        maxDepth = 5,
                        reportProgress = rxGetOption("reportProgress"))
)
##
# Elapsed time for RxDTreeBase: 89.545 secs.
#    user  system elapsed
#  245.13   12.50   89.57
#
# Elapsed time for RxDTreeBase: 403.785 secs.
# (this run was to fully build out the tree)
#    user  system elapsed
# 1092.37   75.89  403.83
model.tree
#
#----------------------------------------------------------------
# Plot the Tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------
###### - END DEMO HERE - ###########
#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
#"ISwR","partykit","pROC","rattle","RColorBrewer",
#"rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A First look at R
# A simple regression example from
# Statistics and Computing, Introductory Statistics with R
# Peter Dalgaard, Springer 2002
##
library(ISwR) # Load a library
data() # Have a look at what data sets are available
data(thuesen) # Load thuesen into the environment
thuesen # Have a look at it
class(thuesen) # Find out what kind of object thuesen is
sapply(thuesen,class) # See what kinds of animal the variables are
#
plot(short.velocity ~ blood.glucose, data=thuesen) #plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity) # plot the data by indexing into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen) # build a linear model
summary(model) # Look at the results
str(model) # Look at the structure of the model object
# Build a fancier plot
plot(x = thuesen$blood.glucose,
     y = thuesen$short.velocity,
     xlab = "blood glucose (mmol / l)",
     ylab = "circumferential shortening velocity (%/s)",
     main = "Thuesen Data set",
     col = "blue",
     pch = 19)
abline(model,col="red")
#
par(mfrow=c(2,2)) # Set up for multiple plots
plot(model, col="blue") # look at some diagnostics
#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data, probs = seq(0, 1, 0.25))
  res <- list(min, max, q)
  return(res)
}
attach(thuesen) # put the columns of thuesen on the search path
                # so they can be used as variables
joe.stats(blood.glucose) # Run our function
summary(blood.glucose) # R does it better
# Set up for later
detach(thuesen) # undo the attach above
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
#SOME ADDITIONAL ONLINE RESOURCES
#An Introduction to R
#Notes on R: A Programming Environment for Data Analysis and Graphics
#Version 2.15.2 (2012-10-26)
#http://cran.r-project.org/doc/manuals/R-intro.pdf
#
#Using R for Data Analysis and Graphics
#Introduction, Code and Commentary
#J H Maindonald
#http://cran.r-project.org/doc/contrib/usingR.pdf
#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the data into training and test data sets,
# builds two different decision trees (rpart) using the training data and
# evaluates their performance using the test data set.
# An ROC curve is produced for the better model.
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
# name <- "weather.csv"
# path <- file.path(getwd(),name)
# weather <- read.csv(path,header=TRUE)
# View the weather data frame in the IDE editor
data(weather)
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42) # Set seed
#-------------------------------------------------------------------------
# Determine the observations for the training and test data sets.
N <- nrow(weather) # 366 observations
train <- sample(N, 0.8*N) # 292 observations
test <- setdiff(seq_len(N),train) # 74 observations not in train
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input <- names(weather)[1:(M-2)] # names of input variables
target <- "RainTomorrow" # name of target variable
form <- formula(RainTomorrow ~ .) # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data = weather[train, c(input,target)],
                method = "class",
                parms = list(split="information"),
                control = rpart.control(usesurrogate=0, maxsurrogate=0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m # print the model
printcp(tree.m) # print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the validate set
pred <- predict(tree.m, weather[test, c(input,target)], type="class")
pred <- factor(pred, levels=c("Yes","No"))     # reorder levels to match the confusionMatrix
                                               # documentation (note: assigning levels() would
                                               # relabel the values, not reorder them)
# Generate the confusion matrix
actual <- weather[test, c(input,target)]$RainTomorrow
actual <- factor(actual, levels=c("Yes","No")) # reorder levels the same way
AP <- c("Predicted","Actual")                  # dimension names for CM
CM <- table(pred, actual, dnn=AP)              # CM counts
confusionMatrix(CM) # from the caret package
?confusionMatrix # Look at meaning of confusionMatrix outputs
# Notes
# The "no-information rate" shown in the output is the largest proportion of the observed classes.
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi-squared test is that the actual and predicted
# marginal probabilities are the same.
# See http://en.wikipedia.org/wiki/McNemar%27s_test
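#
# A minimal sketch (not in the original webinar script) re-deriving some of
# these statistics by hand from the 2x2 count table CM built above;
# kappa.by.hand is an ad hoc helper name:
max(colSums(CM))/sum(CM)  # no-information rate: largest observed class proportion
kappa.by.hand <- function(tab){
  n  <- sum(tab)
  po <- sum(diag(tab))/n                   # observed agreement (accuracy)
  pe <- sum(rowSums(tab)*colSums(tab))/n^2 # agreement expected by chance
  (po - pe)/(1 - pe)
}
kappa.by.hand(CM)         # should match the kappa reported by confusionMatrix
mcnemar.test(CM)          # the McNemar test reported by confusionMatrix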
#
#--------------------------------------------------------------------------------------------
# Try another model using different variables
form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
                 data = weather[train, c(input,target)],
                 method = "class",
                 parms = list(split="information"),
                 control = rpart.control(usesurrogate=2,
                                         maxsurrogate=0,
                                         minsplit=30,
                                         maxdepth=20))
#----------------------------------------------------------------------------------------------
# Plot the new tree
drawTreeNodes(tree.m2)
title(main="Weather Data tree.m2",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
tree.mod.p <- as.party(tree.m2) # make the tree.mod object into a party object
plot(tree.mod.p)
#----------------------------------------------------------------------------
# Evaluate performance of the new model on the test set
pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
pred2 <- factor(pred2, levels=c("Yes","No")) # reorder levels, as above
CM2 <- table(pred2,actual,dnn=AP)
confusionMatrix(CM2)
# -----------------------------------------------------------------------------------
#
# GENERATE THE ROC CURVE FOR THE BEST MODEL
prROC <- predict(tree.m, weather[test, c(input,target)])[,2]
#
# Get the vector RainTomorrow from the test data set
testRT <- weather[test, c(input,target)]$RainTomorrow
pr <- prediction(prROC, testRT)
#------------------------------------------------------------------------------------
# Plot the ROC curve
plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2,add=FALSE)
#fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
#tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
segments(0,0,1,1,col="blue",lwd=2)
# Add a legend to the plot.
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
# Add decorations to the plot.
title(main="ROC Curve weather.csv [test data]",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#
#------------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# DATA MINING with CARET
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#------------------------------------------------------------------------------
# INTRODUCTION TO THE CARET PACKAGE
# caret is a feature-rich package for doing data mining in R.
# This script explores caret's capabilities using data included in the
# package that was described in the paper:
# Hill et al., "Impact of image segmentation on high-content
# screening data quality for SK-BR-3 cells",
# BMC Bioinformatics (2007) vol 8 (1) pp. 340
#
# Background
# Well-segmented (WS) cells are cells for which location and size may be accurately determined
# through optical measurements. Cells that are not well-segmented are said to be
# poorly-segmented (PS).
#
# Problem
# Given a set of optical measurements, can we predict which cells will be PS?
# This is a classic classification problem.
#---------------------------
library(ada) # Boosting algorithms
library(caret)
library(rpart) # CART algorithm for decision trees
library(partykit) # Plotting trees
library(doParallel) # parallel processing: by default, multicore
                    # functionality on Unix (single machine only),
                    # snow functionality on Windows (cluster)
library(pROC) # plot the ROC curve
library(corrplot) # plot correlations
#---------------------------
# data(package="caret")
data(segmentationData) # Load the segmentation data set
dim(segmentationData)  # [1] 2019   61
head(segmentationData) # Have a look at the data
trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
trainData <- segmentationData[trainIndex,]
dim(trainData)         # [1] 1010   61
testData <- segmentationData[-trainIndex,]
dim(testData)          # [1] 1009   61
#-------------------------------------------------------------------------------------
# VISUALIZE CORRELATIONS
trainV <- trainData[,4:61]
corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")
#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth = 30, # the maximum depth of any node of the final tree
                         cp = 0.01,     # complexity parameter: any split that does not decrease
                                        # the overall lack of fit by a factor of cp is not attempted
                         minsplit = 20, # the minimum number of observations that must exist
                                        # in a node in order for a split to be attempted
                         xval = 10)     # number of cross-validations
ada.model <- ada(formula = form,
                 data = trainData,
                 control = control,
                 nu = .01,  # shrinkage parameter for boosting
                 iter = 50)
ada.model$model[[1]] # Look at the trees in the model
ada.model # look at the model performance
plot(ada.model,TRUE) # Plot error rate vs. iterations of the model
varplot(ada.model) # Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values
# for the three tuning parameters?
# nu
# iter
# maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
# for each resampled data set do
#    hold out some samples
#    for each combination of the three tuning parameters do
#       fit the model on the resampled data set
#       predict the values of class on the held-out samples
#    end
#    calculate AUC: the area under the ROC curve for each sample
# end
# Select the combination of tuning parameters that yields the best AUC
#
# caret provides the "train" function to do all of this
#
# The trainControl function sets the training method.
# Note: the default metrics for picking the best model are accuracy and Cohen's kappa.
#
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method = "repeatedcv",  # repeated K-fold cross-validation
                     number = 5,             # the number of folds
                     repeats = 2,            # do 2 repetitions of 5-fold CV
                     summaryFunction = twoClassSummary, # use AUC to pick the best model
                     classProbs = TRUE)
# Use expand.grid to specify the search space.
# Note that the default search grid selects 3 values of each tuning parameter.
#
grid <- expand.grid(.nu = c(.1, 1),
                    .iter = c(20, 50),
                    .maxdepth = c(20, 30))
#
set.seed(1)
#names(trainData)
trainX <- trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")
cl <- makeCluster(4)   # manually create a cluster; on a single Windows
                       # machine this amounts to choosing the number of
                       # worker processes
registerDoParallel(cl) # register a parallel backend for train
getDoParWorkers()
system.time(ada.tune <- train(x = trainX, y = trainData$Class,
                              method = "ada",
                              metric = "ROC",
                              trControl = ctrl,
                              control = control,
                              tuneGrid = grid))
#
stopCluster(cl)
#    user  system elapsed
#   14.33    0.02  206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune # Look at the results for the training grid
ada.tune$finalModel # Look at the performance of the final model
plot(ada.tune) # Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor = ada.probs$PS,
               response = testData$Class,
               levels = rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc # Get the area under the curve
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
  svm.tune <- train(x = trainX,
                    y = trainData$Class,
                    method = "svmRadial",
                    tuneLength = 5,   # evaluate 5 values of the cost parameter C
                    preProc = c("center","scale"),
                    metric = "ROC",
                    trControl = ctrl) # same as for ada above
)
#    user  system elapsed
#    2.40    0.14   26.10
#--------------------------------------------------------------
# SVM RESULTS
svm.tune # Look at the results for the training grid
svm.tune$finalModel # Look at the performance of the final model
plot(svm.tune,
     metric = "ROC",
     scales = list(x = list(log = 2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models we can compare the models using resampling.
# See Hothorn et al., "The design and analysis of benchmark experiments",
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699,
# for details on comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values # Look at the resample values
summary(rValues) # Summarize the resamples
#---------------------------------------------
xyplot(rValues,metric="ROC") # scatter plot
bwplot(rValues,metric="ROC") # boxplot
parallel(rValues,metric="ROC") # parallel plot
dotplot(rValues,metric="ROC") # dotplot
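#
# A minimal sketch (beyond the webinar code): caret can also summarize
# the paired differences between the two models directly
rDiffs <- diff(rValues) # diff.resamples: per-resample model differences
summary(rDiffs)         # paired t-tests on the ROC, Sens and Spec differences
dotplot(rDiffs)         # visualize the differences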
#
##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#################################################################################
#
library(rattle) # load the rattle package
rattle() # start the rattle user interface
#
#data() # see what data sets are available in all of the loaded packages
data(package="rattle") # see what data sets are availeble in rattle
ls("package:rattle") # See what functions are in the Rattle package
#lsf.str("package:rattle") # see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET.
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating Data
# for Knowledge Discovery, Springer 2011.
#
# Go to the Data Tab and click on Execute
# Rattle will ask if you want to use the weather data as default. Click yes.
#
# SUMMARY STATISTICS
# Go to the Explore Tab
# Select summary and basics
# Hit Execute
#
# SCATTER PLOTS
# Go to the Explore Tab
# Select Distributions
# Click on Execute
#
# LOOK AT A SINGLE VARIABLE
# Go to Explore Tab
# Select RainTomorrow Bar Plot
# Hit Execute
# This produces a bar plot of the target variable RainTomorrow
# 84% of the observations have no rain
# A model that always predicts no rain should be about 84% accurate
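#
# A quick sanity check of that base rate in plain R (a minimal sketch,
# not part of the GUI walkthrough):
data(weather, package="rattle")
prop.table(table(weather$RainTomorrow)) # proportion of "No" days ~ baseline accuracy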
#
# INVESTIGATE MULTIPLE VARIABLES
# Go to the Explore Tab
# In the upper panel select Box Plot and Histogram for both
# MaxTemp
# Sunshine
# Click Execute
#
# Boxplots top left: Temperature generally higher day before it rains
# Boxplots top right: Less sunshine day before it rains
#
# CORRELATIONS
# Go to the Explore Tab
# Deselect any variables that may be selected
# Select Correlation
# Click on Execute
#
#
# INTERACTIVELY EXPLORE DATA
# Select Interactive and then Latticist
# In bottom center panel
# Select MaxTemp for y axis and
# Select Sunshine for x axis
# Place crosshair on outlier and right click
#
# BUILD A TREE MODEL
# Go to Model Tab
# Select Tree
# Click Execute
# Click on Draw button to get the graph
# Click on Rules button to see rules
# Select Log Tab to look at R code
#
# EVALUATE THE MODEL
# Go to the Evaluate tab
# Select
# Type = Error Matrix
# Model = Tree
# Data = Testing
# Click on Execute
#
# Error matrix for the Decision Tree model on weather.csv [test] (counts):
#
#            Predicted
# Actual     No  Yes
# No         35    6   False positive rate = FP/N = 6/(35+6) = .146
#                      = negatives incorrectly classified / total negatives
# Yes         5   10   True positive rate = TP/P = 10/(10+5) = .667
#                      = positives correctly classified / total positives
#                      = sensitivity = recall = hit rate
#                      True negative rate = TN/(FP+TN) = 1 - FP rate = .854
#                      = specificity
#
# False positives = 6 = Type I error (test rejects true null hypothesis)
# False negatives = 5 = Type II error (test fails to reject false null hypothesis)
#
# Error matrix for the Decision Tree model on weather.csv [test] (%):
#
#            Predicted
# Actual     No  Yes
# No         62   11   In 62% (35/56) of cases the model predicts it won't rain and it doesn't
# Yes         9   18   In 18% (10/56) of cases the model predicts it will rain and it does
#
# Accuracy of test = 62% + 18% = 80%
#
# Overall error: 0.1964286
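#
# A minimal sketch (not from the Rattle log) re-deriving the rates above
# from the raw counts; cm is an ad hoc name:
cm <- matrix(c(35, 5, 6, 10), nrow = 2,
             dimnames = list(Actual = c("No","Yes"), Predicted = c("No","Yes")))
cm["No","Yes"]/sum(cm["No",])   # false positive rate: 6/41  ~ 0.146
cm["Yes","Yes"]/sum(cm["Yes",]) # true positive rate:  10/15 ~ 0.667
sum(diag(cm))/sum(cm)           # accuracy: 45/56 ~ 0.804 (overall error ~ 0.196)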
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# From an example at RDataMining
# http://www.rdatamining.com/examples/text-mining
# which shows text mining of Twitter data
#-----------------------------------------------------------------------------
# Load the necessary libraries
library(twitteR) # twitteR provides access to Twitter data
library(tm) # tm provides functions for text mining
library(Snowball) # Wrappers for Weka Java stemming functions
library(wordcloud) # wordcloud visualizes the result with a word cloud
library(RColorBrewer) # provides the rainbow colors
#------------------------------------------------------------------------------
# retrieve up to 100 recent tweets matching the #rstats hashtag
#
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
#Transforming Text
#The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case twitter is off-line
#df <-read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
# After that, the corpus needs a couple of transformations, including
# changing letters to lower case,
# removing punctuation/numbers and removing stop words.
# The general English stop-word list is tailored by
# adding "available" and "via" and removing "r".
myCorpus <- tm_map(myCorpus, tolower) # lower case
myCorpus <- tm_map(myCorpus, removePunctuation) # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers) # remove numbers
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
#Stemming Words
# In many cases, words need to be stemmed to retrieve their radicals.
# For instance, "example" and "examples" are both stemmed to "exampl".
# However, after that, one may want to complete the stems to their original
# forms, so that the words would look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
#inspect(myCorpus[1:3]) # inspect the first three "documents"
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion
#
#
#inspect(myCorpus[1:3]) #Print the first three documents in the built corpus.
#----------------------------------------------------------------------------------------
#Building a Document-Term Matrix
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# inspect(myDtm[266:270,31:40])
#
# findFreqTerms(myDtm, lowfreq=10) #Frequent Terms and Associations
# findAssocs(myDtm, 'analytics', 0.30) # which words are associated with "analytics"?
#-----------------------------------------------------------------------------------------
#Build the word cloud
# After building a document-term matrix, we can show the importance of
# words with a word cloud (also known as a tag cloud).
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud
pal <- brewer.pal(6, "Dark2")
pal <- pal[-1] # drop the first color
# random word order and random colors
wordcloud(d$word, d$freq, scale = c(4, 1), min.freq = 2,
          random.order = TRUE, random.color = TRUE, rot.per = .15,
          colors = pal)