duttashi · January 1, 2022 21:12
diff --git a/data_split.R b/data_split.R
 # Method 1: base-R random hold-out split (no extra packages required)
 data(mtcars)

 ## Fix the RNG state first so the same rows are drawn on every run
 set.seed(123)

 ## Training set size: 75% of the rows, rounded down
 smp_size <- floor(nrow(mtcars) * 0.75)

 ## Draw the training row positions (without replacement)
 train_ind <- sample.int(nrow(mtcars), size = smp_size)

 ## Rows not drawn form the test set; the drawn rows form the training set
 test <- mtcars[-train_ind, ]
 train <- mtcars[train_ind, ]

 # Method 2: stratified split with caTools::sample.split()
 # This gist is derived from a post that I published on WordPress a few years ago.
 # Reference: https://edumine.wordpress.com/2015/04/06/splitting-a-data-frame-into-training-and-testing-sets-in-r/
 # Reference: https://stackoverflow.com/questions/17200114/how-to-split-data-into-training-testing-sets-using-sample-function

 library(caTools)

 # Load data. The file is read with header = FALSE, so the column names are
 # supplied here.
 iris.data <- read.csv(
   url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"),
   header = FALSE, sep = ",",
   col.names = c("sepal.length", "sepal.width", "petal.length", "petal.width", "species")
 )

 # Divide the data into train and test sets.
 # Set a seed value so the split is reproducible.
 set.seed(1208)

 # sample.split() returns a logical vector with one entry per element of its
 # first argument: TRUE marks training rows, FALSE marks test rows.
 # It must be given the vector of class labels (one per row), not the whole
 # data frame: the length of a data frame is its number of columns, so the mask
 # would have only 5 entries and would be silently recycled across the rows in
 # a repeating pattern instead of producing a 70/30 split.
 # Given the species labels, SplitRatio = 0.7 puts about 70% of each species
 # in the training set, keeping the class proportions the same in both sets.
 iris.data$spl <- sample.split(iris.data$species, SplitRatio = 0.7)

 # Rows flagged TRUE go to the training set, all other rows to the test set
 iris.data.train <- iris.data[iris.data$spl, ]
 iris.data.test <- iris.data[!iris.data$spl, ]

 # Drop the helper spl column from both the training and the test set
 iris.data.train$spl <- NULL
 iris.data.test$spl <- NULL
	# Method 1: base-R random hold-out split (no extra packages required)
	data(mtcars)

	## Fix the RNG state first so the same rows are drawn on every run
	set.seed(123)

	## Training set size: 75% of the rows, rounded down
	smp_size <- floor(nrow(mtcars) * 0.75)

	## Draw the training row positions (without replacement)
	train_ind <- sample.int(nrow(mtcars), size = smp_size)

	## Rows not drawn form the test set; the drawn rows form the training set
	test <- mtcars[-train_ind, ]
	train <- mtcars[train_ind, ]

	# Method 2: stratified split with caTools::sample.split()
	# This gist is derived from a post that I published on WordPress a few years ago.
	# Reference: https://edumine.wordpress.com/2015/04/06/splitting-a-data-frame-into-training-and-testing-sets-in-r/
	# Reference: https://stackoverflow.com/questions/17200114/how-to-split-data-into-training-testing-sets-using-sample-function

	library(caTools)

	# Load data. The file is read with header = FALSE, so the column names are
	# supplied here.
	iris.data <- read.csv(
	  url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"),
	  header = FALSE, sep = ",",
	  col.names = c("sepal.length", "sepal.width", "petal.length", "petal.width", "species")
	)

	# Divide the data into train and test sets.
	# Set a seed value so the split is reproducible.
	set.seed(1208)

	# sample.split() returns a logical vector with one entry per element of its
	# first argument: TRUE marks training rows, FALSE marks test rows.
	# It must be given the vector of class labels (one per row), not the whole
	# data frame: the length of a data frame is its number of columns, so the mask
	# would have only 5 entries and would be silently recycled across the rows in
	# a repeating pattern instead of producing a 70/30 split.
	# Given the species labels, SplitRatio = 0.7 puts about 70% of each species
	# in the training set, keeping the class proportions the same in both sets.
	iris.data$spl <- sample.split(iris.data$species, SplitRatio = 0.7)

	# Rows flagged TRUE go to the training set, all other rows to the test set
	iris.data.train <- iris.data[iris.data$spl, ]
	iris.data.test <- iris.data[!iris.data$spl, ]

	# Drop the helper spl column from both the training and the test set
	iris.data.train$spl <- NULL
	iris.data.test$spl <- NULL