Skip to content

Instantly share code, notes, and snippets.

@talayhan
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save talayhan/a86de672d108bf557285 to your computer and use it in GitHub Desktop.

Select an option

Save talayhan/a86de672d108bf557285 to your computer and use it in GitHub Desktop.
R
# -------------------------------
# Homework 2 - Machine Learning
# Samet Sait Talayhan
# -------------------------------
# o / \ //\
# o |\___/| / \// \\
# /0 0 \__ / // | \ \
# / / \/_/ // | \ \
# @_^_@'/ \/_ // | \ \
# //_^_/ \/_ // | \ \
# ( //) | \/// | \ \
# ( / /) _|_ / ) // | \ _\
# ( // /) '/,_ _ _/ ( ; -. | _ _\.-~ .-~~~^-.
# (( / / )) ,-{ _ `-.|.-~-. .~ `.
# (( // / )) '/\ / ~-. _ .-~ .-~^-. \
# (( /// )) `. { } / \ \
# (( / )) .----~-.\ \-' .~ \ `. \^-.
# ///.----..> \ _ -~ `. ^-` ^-_
# ///-._ _ _ _ _ _ _}^ - - - - ~ ~-- ,.-~
# /.-~
#
# Note: I created this beautiful dragon using the cowsay app: ~$ cowsay -f dragon "Talayan"
## Step 2: Exploring and preparing the data ----
# Import the training data; keep strings as characters so the label
# column can be recoded as a factor explicitly below.
data_set <- read.csv("hw2train.csv", stringsAsFactors = FALSE)
# Examine the structure of the data frame
str(data_set)
# Drop the class features (disabled)
# data_set <- data_set[, 16:43]
# Fix the RNG seed so the shuffle (and the train/test split derived
# from it further down) is reproducible across runs.
set.seed(42)
# Shuffle the rows
data_set <- data_set[sample(nrow(data_set)), ]
# Table of the rightmost (label) feature
table(data_set$Feat.43)
# Recode Feat.43 as a factor; -1/1 become Negative/Positive
data_set$Feat.43 <- factor(data_set$Feat.43, levels = c(-1, 1),
                           labels = c("Negative", "Positive"))
# Table of proportions with more informative labels
round(prop.table(table(data_set$Feat.43)) * 100, digits = 1)
# Summarize three numeric features
summary(data_set[c("Feat.17", "Feat.18", "Feat.19")])
# Min-max normalization: rescale a numeric vector to the [0, 1] range.
#
# x     - numeric vector to rescale
# na.rm - drop NAs before computing the range? Defaults to FALSE, which
#         keeps the original behavior (any NA makes the result all NA).
#
# Returns a numeric vector the same length as x. Note: a constant
# vector (max == min) yields NaN because the range is zero.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Sanity check - both calls should give identical results
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))
# Normalize the 42 predictor columns (column 43 is the class label)
data_set_n <- as.data.frame(lapply(data_set[1:42], normalize))
# Confirm that normalization worked (range should be [0, 1])
summary(data_set_n$Feat.17)
# Shuffle normalized data (not needed: rows were already shuffled above)
# data_set_n_shuffle <- data_set_n[sample(nrow(data_set_n)), ]
# Create training and test data. Derive the upper bound from the actual
# row count instead of hard-coding 164, so the split still covers all
# rows if the input file changes size.
n_rows <- nrow(data_set_n)
train_size <- 99
data_set_train <- data_set_n[1:train_size, ]
data_set_test <- data_set_n[(train_size + 1):n_rows, ]
# Class distribution of the label column
table(data_set$Feat.43)
# Create labels for training and test data (column 43 = Feat.43 factor)
data_set_train_labels <- data_set[1:train_size, 43]
data_set_test_labels <- data_set[(train_size + 1):n_rows, 43]
## Step 3: Training a model on the data ----
# k-nearest neighbours classifier from the "class" package
library(class)
# Classify each test row by majority vote among its 10 nearest
# training neighbours.
data_set_test_pred <- knn(train = data_set_train,
                          test = data_set_test,
                          cl = data_set_train_labels,
                          k = 10)
## Step 4: Evaluating model performance ----
# Cross-tabulation helper from the "gmodels" package
library(gmodels)
# Cross-tabulate actual vs. predicted labels; suppress the chi-square
# contribution cells to keep the table readable.
CrossTable(x = data_set_test_labels,
           y = data_set_test_pred,
           prop.chisq = FALSE)
## Step 5: Optimize the solution ----
# corrplot visualizes a correlation matrix; caret supplies
# findCorrelation(). Install corrplot only when it is missing instead
# of unconditionally re-installing on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
library(mlbench)
library(caret)
# Compute the correlation matrix of the normalized features
correlation_mat <- cor(data_set_n)
# Summarize the correlation matrix
print(correlation_mat)
# Find attributes that are highly correlated. The cutoff used here is
# 0.5 (0.75 is the commonly recommended threshold).
# Bug fix: the original passed an undefined `correlationMatrix`.
highlyCorrelated <- findCorrelation(correlation_mat, cutoff = 0.5)
# Visualize the matrix, clustering features by correlation index
corrplot(correlation_mat, order = "hclust")
# NOTE(review): changing the working directory from inside a script is
# discouraged; kept for compatibility with the original workflow.
setwd("/home/t4/Dropbox/Machine Learning/hw2")
# The original called install.packages("plyr_1.8.2", repos = NULL),
# which fails: with repos = NULL the first argument must be a path to a
# local package archive. Install plyr from CRAN only if it is missing.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment