Created
June 22, 2014 23:11
-
-
Save joelkr/503844e2214de02ab91e to your computer and use it in GitHub Desktop.
R data-munging script for the UC Irvine Cardiac Arrhythmia data set
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the raw arrhythmia data. The file has no header row, and "?" entries
# are read in as NA.
cardiac <- read.csv("arrhythmia.data", header = FALSE, na.strings = "?")

# Name the 280 columns. Columns whose names end in "_Nom" are the ones that
# are excluded when the numeric subset is built later in this script.
colnames(cardiac)[1:280] <- local({
  # Lead prefixes shared by both per-lead column groups, in file order.
  leads <- c("DII", "DIII", "AVR", "AVL", "AVF",
             "V1", "V2", "V3", "V4", "V5", "V6")

  # Twelve columns per lead, suffixed 00..11 ("DII00" ... "V611").
  # Generating the names removes the hand-typed "AVRL1" typo, which is now
  # "AVL01" like the rest of its series.
  per_lead_12 <- unlist(
    lapply(leads, function(lead) sprintf("%s%02d", lead, 0:11)),
    use.names = FALSE
  )

  # Ten columns per lead, suffixed with a running index that starts at 170
  # and advances by 10 for each lead ("DII170" ... "V6279").
  per_lead_10 <- unlist(
    Map(function(lead, start) paste0(lead, start + 0:9),
        leads, seq(170, by = 10, length.out = length(leads))),
    use.names = FALSE
  )

  col_names <- c(
    "Age", "Gender_Nom", "Height", "Weight", "QRS_Dur",
    "P-R_Int", "Q-T_Int", "T_Int", "P_Int", "QRS", "T", "P", "QRST", "J",
    "Heart_Rate",
    "Q_Wave", "R_Wave", "S_Wave", "R_Prime", "S_Prime", "Int_Def",
    "Rag_R_Nom", "Diph_R_Nom", "Rag_P_Nom", "Diph_P_Nom", "Rag_T_Nom",
    "Diph_T_Nom",
    per_lead_12,
    # NOTE(review): "Q_Wave", "R_Wave" and "S_Wave" repeat names already used
    # above, so the data frame ends up with duplicated column names. They are
    # kept as-is to avoid changing the written CSV headers; consider renaming.
    "JJ_Wave", "Q_Wave", "R_Wave", "S_Wave", "R_Prime_Wave", "S_Prime_Wave",
    "P_Wave", "T_Wave",
    "QRSA", "QRSTA",
    per_lead_10,
    "Class_Nom"
  )

  # Guard against an accidental miscount when this list is edited.
  stopifnot(length(col_names) == 280L)
  col_names
})
# Numeric subset: drop every column whose name ends in "_Nom".
# The pattern is an anchored regular expression; the previous "*Nom" was a
# shell-style glob that only worked because R tolerates a leading "*".
cardiac_num <- cardiac[, !grepl("_Nom$", colnames(cardiac)), drop = FALSE]
# Some columns have a max and min of zero, so they are probably useless.
# It is possible that some groups of columns are actually time plots of
# data that pass through zero. Need to plot before dropping anything.
#cardiac_num <- cardiac_num[, apply(cardiac_num, 2, function(x) ! all(x == 0))]
# Removing rows with NA's from the numeric data reduces the set from 452 to
# 68 cases, so we must find another way to handle missing data.
#cardiac_num <- na.omit(cardiac_num)
# Separate training and test sets. A fixed seed gives the same sample on
# every run.
set.seed(1234)
# seq_len() stays well-defined (empty) even if the data frame has no rows,
# unlike 1:nrow(), which would yield c(1, 0).
index <- seq_len(nrow(cardiac))
# Two thirds of the rows (rounded down) go to the training set; the rest
# form the test set.
trainindex <- sample(index, trunc(2 * length(index) / 3))
cardiac_train <- cardiac[trainindex, ]
cardiac_test <- cardiac[-trainindex, ]
# Save unmodified, train, and test sets
write.table(cardiac, "cardiac.csv", sep = ",", row.names = FALSE)
save(cardiac, file = "cardiac.RData", compress = TRUE)
write.table(cardiac_train, "cardiac_train.csv", sep = ",", row.names = FALSE)
write.table(cardiac_test, "cardiac_test.csv", sep = ",", row.names = FALSE)
# Apply the same row split to the numeric-only data
cardiac_num_train <- cardiac_num[trainindex, ]
cardiac_num_test <- cardiac_num[-trainindex, ]
# Next we construct indicator matrices of 0's and 1's: 1 where we have data
# and 0 where the value is NA. For the method in Coursera Machine Learning,
# both the train and test sets need a matrix like this.
#
# The masks MUST be built before the NA's are replaced below. Previously the
# train mask was rebuilt after cardiac_num_train had already been zero-filled,
# which left Rtrain as all 1's and lost the missing-data information.
Rtrain <- matrix(1, nrow = nrow(cardiac_num_train),
                 ncol = ncol(cardiac_num_train))
Rtrain[is.na(cardiac_num_train)] <- 0
Rtest <- matrix(1, nrow = nrow(cardiac_num_test),
                ncol = ncol(cardiac_num_test))
Rtest[is.na(cardiac_num_test)] <- 0
# Only now replace NA's with 0 in the numeric data
cardiac_num_train[is.na(cardiac_num_train)] <- 0
cardiac_num_test[is.na(cardiac_num_test)] <- 0
# Write numeric tables
write.table(cardiac_num_train, "cardiac_num_train.csv", sep = ",",
            row.names = FALSE)
write.table(cardiac_num_test, "cardiac_num_test.csv", sep = ",",
            row.names = FALSE)
# Save work in R format
save(cardiac_num_train, cardiac_num_test, Rtrain, Rtest,
     file = "cardiacSetup.rda")
# Clear the full data, the test data and the split helpers from the
# environment. Rtest, Rtrain and the training sets are left in place.
rm(cardiac, cardiac_num, cardiac_test, cardiac_num_test, trainindex, index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment