Skip to content

Instantly share code, notes, and snippets.

@talayhan
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save talayhan/a86de672d108bf557285 to your computer and use it in GitHub Desktop.

Select an option

Save talayhan/a86de672d108bf557285 to your computer and use it in GitHub Desktop.
R
# -------------------------------
# Homework 2 - Machine Learning
# Samet Sait Talayhan
# -------------------------------
# o / \ //\
# o |\___/| / \// \\
# /0 0 \__ / // | \ \
# / / \/_/ // | \ \
# @_^_@'/ \/_ // | \ \
# //_^_/ \/_ // | \ \
# ( //) | \/// | \ \
# ( / /) _|_ / ) // | \ _\
# ( // /) '/,_ _ _/ ( ; -. | _ _\.-~ .-~~~^-.
# (( / / )) ,-{ _ `-.|.-~-. .~ `.
# (( // / )) '/\ / ~-. _ .-~ .-~^-. \
# (( /// )) `. { } / \ \
# (( / )) .----~-.\ \-' .~ \ `. \^-.
# ///.----..> \ _ -~ `. ^-` ^-_
# ///-._ _ _ _ _ _ _}^ - - - - ~ ~-- ,.-~
# /.-~
#
# Note: I created this beautiful dragon using the cowsay app: ~$ cowsay -f dragon "Talayan"
## Step 2: Exploring and preparing the data ----
# Import the training data; keep strings as characters so the label
# column can be recoded as a factor explicitly below.
data_set <- read.csv("hw2train.csv", stringsAsFactors = FALSE)
# Examine the structure of the data frame
str(data_set)
# Drop the class features (disabled)
# data_set <- data_set[, 16:43]
# Fix the RNG seed so the shuffle (and the train/test split derived
# from it further down) is reproducible across runs.
set.seed(42)
# Shuffle the rows
data_set <- data_set[sample(nrow(data_set)), ]
# Table of the rightmost (label) feature
table(data_set$Feat.43)
# Recode Feat.43 as a factor; -1/1 become Negative/Positive
data_set$Feat.43 <- factor(data_set$Feat.43, levels = c(-1, 1),
                           labels = c("Negative", "Positive"))
# Table of proportions with more informative labels
round(prop.table(table(data_set$Feat.43)) * 100, digits = 1)
# Summarize three numeric features
summary(data_set[c("Feat.17", "Feat.18", "Feat.19")])
# Min-max normalization: rescale a numeric vector to the [0, 1] range.
#
# x     - numeric vector to rescale
# na.rm - drop NAs before computing the range? Defaults to FALSE, which
#         keeps the original behavior (any NA makes the result all NA).
#
# Returns a numeric vector the same length as x. Note: a constant
# vector (max == min) yields NaN because the range is zero.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Sanity check - both calls should give identical results
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))
# Normalize the 42 predictor columns (column 43 is the class label)
data_set_n <- as.data.frame(lapply(data_set[1:42], normalize))
# Confirm that normalization worked (range should be [0, 1])
summary(data_set_n$Feat.17)
# Shuffle normalized data (not needed: rows were already shuffled above)
# data_set_n_shuffle <- data_set_n[sample(nrow(data_set_n)), ]
# Create training and test data. Derive the upper bound from the actual
# row count instead of hard-coding 164, so the split still covers all
# rows if the input file changes size.
n_rows <- nrow(data_set_n)
train_size <- 99
data_set_train <- data_set_n[1:train_size, ]
data_set_test <- data_set_n[(train_size + 1):n_rows, ]
# Class distribution of the label column
table(data_set$Feat.43)
# Create labels for training and test data (column 43 = Feat.43 factor)
data_set_train_labels <- data_set[1:train_size, 43]
data_set_test_labels <- data_set[(train_size + 1):n_rows, 43]
## Step 3: Training a model on the data ----
# k-nearest neighbours classifier from the "class" package
library(class)
# Classify each test row by majority vote among its 10 nearest
# training neighbours.
data_set_test_pred <- knn(train = data_set_train,
                          test = data_set_test,
                          cl = data_set_train_labels,
                          k = 10)
## Step 4: Evaluating model performance ----
# Cross-tabulation helper from the "gmodels" package
library(gmodels)
# Cross-tabulate actual vs. predicted labels; suppress the chi-square
# contribution cells to keep the table readable.
CrossTable(x = data_set_test_labels,
           y = data_set_test_pred,
           prop.chisq = FALSE)
## Step 5: Optimize the solution ----
# corrplot visualizes a correlation matrix; caret supplies
# findCorrelation(). Install corrplot only when it is missing instead
# of unconditionally re-installing on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
library(mlbench)
library(caret)
# Compute the correlation matrix of the normalized features
correlation_mat <- cor(data_set_n)
# Summarize the correlation matrix
print(correlation_mat)
# Find attributes that are highly correlated. The cutoff used here is
# 0.5 (0.75 is the commonly recommended threshold).
# Bug fix: the original passed an undefined `correlationMatrix`.
highlyCorrelated <- findCorrelation(correlation_mat, cutoff = 0.5)
# Visualize the matrix, clustering features by correlation index
corrplot(correlation_mat, order = "hclust")
# NOTE(review): changing the working directory from inside a script is
# discouraged; kept for compatibility with the original workflow.
setwd("/home/t4/Dropbox/Machine Learning/hw2")
# The original called install.packages("plyr_1.8.2", repos = NULL),
# which fails: with repos = NULL the first argument must be a path to a
# local package archive. Install plyr from CRAN only if it is missing.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment