-
-
Save ndamulelonemakh/0462550a869cb5735b2dbc3c146e6559 to your computer and use it in GitHub Desktop.
Splitting a given dataset into training and test sets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Method 1: base R only -- easiest, and does not require any library
# Load the built-in mtcars data set into the workspace.
data(mtcars)

## Training-set size: 75% of the rows, rounded down to a whole number
smp_size <- floor(0.75 * nrow(mtcars))

## Set the seed to make the partition reproducible
set.seed(123)

## Draw smp_size distinct row indices at random
## (sample() draws without replacement by default)
train_ind <- sample(seq_len(nrow(mtcars)), size = smp_size)

## The sampled rows form the training set; negative indexing keeps every
## remaining row for the test set, so the two sets never overlap.
train <- mtcars[train_ind, ]
test <- mtcars[-train_ind, ]
# Method 2: stratified split with caTools::sample.split()
# This gist is derived from a post that I had published on WordPress a few years ago.
# Reference: https://edumine.wordpress.com/2015/04/06/splitting-a-data-frame-into-training-and-testing-sets-in-r/
# Reference: https://stackoverflow.com/questions/17200114/how-to-split-data-into-training-testing-sets-using-sample-function

# Load data
# header = FALSE because the file is read without a header row; the column
# names are supplied explicitly through col.names instead.
iris.data <- read.csv(
  url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"),
  header = FALSE,
  sep = ",",
  col.names = c(
    "sepal.length", "sepal.width", "petal.length", "petal.width",
    "species"
  )
)
# Divide the data into train and test
# Reference: https://edumine.wordpress.com/2015/04/06/splitting-a-data-frame-into-training-and-testing-sets-in-r/
library(caTools)

# Set a seed value for reproducibility
set.seed(1208)

# sample.split() must be given the vector of class labels, NOT the whole data
# frame. A data frame has length equal to its number of columns (5 here), so
# passing iris.data would yield a 5-element mask that R silently recycles
# across all rows -- a fixed repeating pattern rather than a random split.
# Passing the species column returns one TRUE/FALSE value per row and keeps
# the ratio between species the same in the training and test sets.
# SplitRatio = 0.7 marks 70% of the rows TRUE (training) and 30% FALSE (test).
iris.data$spl <- sample.split(iris.data$species, SplitRatio = 0.7)

# Every column except the helper spl column goes into the two subsets
feature_cols <- setdiff(names(iris.data), "spl")

# spl == TRUE rows form the training set, spl == FALSE rows form the test set
iris.data.train <- iris.data[iris.data$spl, feature_cols]
iris.data.test <- iris.data[!iris.data$spl, feature_cols]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment