-
-
Save mick001/ac92e7c017aecff216fd to your computer and use it in GitHub Desktop.
# Load the raw training data and replace missing values with NA | |
training.data.raw <- read.csv('train.csv',header=T,na.strings=c("")) | |
# Output the number of missing values for each column | |
sapply(training.data.raw,function(x) sum(is.na(x))) | |
# Quick check for how many different values for each feature | |
sapply(training.data.raw, function(x) length(unique(x))) | |
# A visual way to check for missing data | |
library(Amelia) | |
missmap(training.data.raw, main = "Missing values vs observed") | |
# Subsetting the data | |
data <- subset(training.data.raw,select=c(2,3,5,6,7,8,10,12)) | |
# Substitute the missing values with the average value | |
data$Age[is.na(data$Age)] <- mean(data$Age,na.rm=T) | |
# R should automatically code Embarked as a factor(). A factor is R's way of dealing with | |
# categorical variables | |
is.factor(data$Sex) # Returns TRUE | |
is.factor(data$Embarked) # Returns TRUE | |
# Check categorical variables encoding for better understanding of the fitted model | |
contrasts(data$Sex) | |
contrasts(data$Embarked) | |
# Remove rows (Embarked) with NAs | |
data <- data[!is.na(data$Embarked),] | |
rownames(data) <- NULL | |
# Train test splitting | |
train <- data[1:800,] | |
test <- data[801:889,] | |
# Model fitting | |
model <- glm(Survived ~.,family=binomial(link='logit'),data=train) | |
summary(model) | |
# Analysis of deviance | |
anova(model,test="Chisq") | |
# McFadden R^2 | |
library(pscl) | |
pR2(model) | |
#------------------------------------------------------------------------------- | |
# MEASURING THE PREDICTIVE ABILITY OF THE MODEL | |
# If prob > 0.5 then 1, else 0. Threshold can be set for better results | |
fitted.results <- predict(model,newdata=subset(test,select=c(2,3,4,5,6,7,8)),type='response') | |
fitted.results <- ifelse(fitted.results > 0.5,1,0) | |
misClasificError <- mean(fitted.results != test$Survived) | |
print(paste('Accuracy',1-misClasificError)) | |
# Confusion matrix | |
library(caret) | |
confusionMatrix(data=fitted.results, reference=test$Survived) | |
library(ROCR) | |
# ROC and AUC | |
p <- predict(model, newdata=subset(test,select=c(2,3,4,5,6,7,8)), type="response") | |
pr <- prediction(p, test$Survived) | |
# TPR = sensitivity, FPR=specificity | |
prf <- performance(pr, measure = "tpr", x.measure = "fpr") | |
plot(prf) | |
auc <- performance(pr, measure = "auc") | |
auc <- [email protected][[1]] | |
auc |
sir,
i got the same accuracy as you did but
when i write code of confusion matrix it shows the following error in r
confusionMatrix(data=fitted.results, reference=test$Survived)
Error in confusionMatrix(data = fitted.results, reference = test$Survived) :
unused arguments (data = fitted.results, reference = test$Survived)
kindly reply
Hi, i managed to get the error corrected by putting it in a "table": see below
confusionMatrix(table(data=fitted.results, reference=test$Survived))
sir,
i got the same accuracy as you did but
when i write code of confusion matrix it shows the following error in rconfusionMatrix(data=fitted.results, reference=test$Survived)
Error in confusionMatrix(data = fitted.results, reference = test$Survived) :
unused arguments (data = fitted.results, reference = test$Survived)kindly reply
Hi, i managed to get the error corrected by putting it in a "table": see below
confusionMatrix(table(data=fitted.results, reference=test$Survived))
yes, it is working thanks :)
Pls help when i run the confusionMatrix, using table()) error msg i got was all arguments must have the same length. Pls help advise whats that problem
Fix for the error (data
and reference
should be factors with the same levels.)in
confusionMatrix(data=fitted.results, reference=test$Survived)
do:
install.packages("e1071")
confusionMatrix(data=as.factor(fitted.results), reference=as.factor(test$Survived))
thanks