Skip to content

Instantly share code, notes, and snippets.

@dmarx
Last active March 27, 2016 01:52
Show Gist options
  • Save dmarx/8f6edf0e57b79acc4b97 to your computer and use it in GitHub Desktop.
Save dmarx/8f6edf0e57b79acc4b97 to your computer and use it in GitHub Desktop.
Spline-model classification of the phoneme data from ESLII
# Fix the RNG seed so the train/test split below is reproducible.
set.seed(123)

library(ElemStatLearn)
library(reshape2)
library(caret)

dim(phoneme) # 4509 x 258

# The first 256 columns are used as features; `g` holds the class label.
g <- phoneme$g
dat <- phoneme[, 1:256]

# 80% of the rows are sampled for training; the remaining rows form the
# test set.
n <- nrow(dat)
train_ix <- sample(n, n * 0.8)

X <- dat[train_ix, ]
colnames(X) <- 1:256
Y <- g[train_ix]
test <- dat[-train_ix, ]

table(Y)
# aa ao dcl iy sh
# 563 827 590 939 688
# The classes are not perfectly balanced, but that should not be a huge
# problem. Cross-validation or more balanced sampling can be added later.

# As an experiment, plot 10 examples of "aa" data against "ao" data.
# ESLII uses 15 curves per class; 10 are used here because 15 looked
# very messy.

# Training rows split out by phoneme class.
aa <- X[Y == "aa", ]
ao <- X[Y == "ao", ]
sh <- X[Y == "sh", ]
iy <- X[Y == "iy", ]
dcl <- X[Y == "dcl", ]
# Overlay the first 10 training curves of "aa" (green) and "ao" (red).
# The x-axis is the frequency index (1..256) and the y-axis is the
# log-periodogram value, so the axis labels are assigned accordingly.
# type = "n" sets up an empty canvas instead of drawing a stray point at (0, 0).
plot(0, 0, type = "n", xlim = c(0, 260), ylim = c(0, 25),
     xlab = "Frequency",
     ylab = "Log-periodogram",
     main = "10 Random Samples from 2 phonemes"
)
for (i in seq_len(10)) {
  # Each row is a one-row data frame; flatten it to a plain numeric vector
  # before handing it to lines().
  lines(1:256, unlist(aa[i, ], use.names = FALSE), col = "green")
  lines(1:256, unlist(ao[i, ], use.names = FALSE), col = "red")
}
#######################################
# Sanity check a spline fit
# Draw every training "aa" curve, then overlay a smoothing spline fitted to
# the pooled (frequency index, value) pairs.
# type = "n" sets up an empty canvas instead of drawing a stray point at (0, 0).
plot(0, 0, type = "n", xlim = c(0, 260), ylim = c(0, 25),
     xlab = "Frequency",
     ylab = "Log-periodogram",
     main = "Plot of ALL 'AA' data"
)
# `aa` is a row subset of a data frame, so length(aa) would be its number of
# columns (256), not the number of curves. Iterate over the rows so that all
# "aa" curves are really drawn.
for (i in seq_len(nrow(aa))) {
  lines(1:256, unlist(aa[i, ], use.names = FALSE), col = "green")
}
# melt() reshapes the wide curves into long (variable, value) form; the
# two-column result is passed to smooth.spline() as the x/y pairs.
aa_melt <- melt(aa)
spl_aa <- smooth.spline(aa_melt, nknots = 256)
# Overlay the fitted spline evaluated at each frequency index.
lines(y ~ x, predict(spl_aa, 1:256))
#####################################################
# Long-format (variable, value) versions of each per-phoneme training
# subset; train_spline_models() reads these by name.
aa_melt  <- melt(aa)
ao_melt  <- melt(ao)
sh_melt  <- melt(sh)
iy_melt  <- melt(iy)
dcl_melt <- melt(dcl)
# Wrapped in a function so the number of knots can be tuned later.
#
# Fits one smoothing spline per phoneme to the melted training data
# (aa_melt, ao_melt, sh_melt, iy_melt, dcl_melt, looked up in the enclosing
# environment) and returns an unnamed list of five numeric vectors: each
# fitted spline evaluated at 1:256, in the order aa, ao, sh, iy, dcl.
#
# n: value passed to smooth.spline() as `nknots` for every fit.
train_spline_models <- function(n = 256) {
  melted <- list(aa_melt, ao_melt, sh_melt, iy_melt, dcl_melt)
  # Fit all five models first, then evaluate each one on the common grid.
  fitted_splines <- lapply(
    melted,
    function(long_df) smooth.spline(long_df, nknots = n)
  )
  lapply(fitted_splines, function(fit) predict(fit, 1:256)$y)
}
# Baseline per-phoneme spline curves, fitted with the default knot count.
spline_fits <- train_spline_models()
# Classify a single curve as the class whose fitted spline curve is closest
# to it in mean squared error.
#
# x:      numeric vector to classify (256 values in this script).
# model:  list of numeric vectors, one fitted curve per class, each the same
#         length as `x`. Defaults to the global `spline_fits`.
# labels: class names, in the same order as the entries of `model`. The
#         default matches the order used by train_spline_models().
#
# Returns the label of the closest curve (a length-one character vector).
# Stops with an error if there are fewer labels than models or if a model
# curve and `x` differ in length (which would otherwise be silently recycled).
spline_classify <- function(x, model = spline_fits,
                            labels = c("aa", "ao", "sh", "iy", "dcl")) {
  if (length(labels) < length(model)) {
    stop("'labels' must supply one name per entry of 'model'", call. = FALSE)
  }
  mse <- vapply(
    model,
    function(v) {
      if (length(v) != length(x)) {
        stop("every model curve must have the same length as 'x'",
             call. = FALSE)
      }
      mean((v - x)^2)
    },
    numeric(1)
  )
  labels[which.min(mse)]
}
# apply classifier over each record of test set, build confusion matrix.
spl_mod1_pred <- apply(test, 1, spline_classify)
# confusionMatrix() in current caret releases requires `data` and `reference`
# to be factors with the same levels, so the character predictions and the
# true labels are both encoded against the full set of class levels.
class_levels <- levels(factor(g))
confusionMatrix(
  factor(spl_mod1_pred, levels = class_levels),
  factor(g[-train_ix], levels = class_levels)
)
#############################################################
# Measure how test-set accuracy changes with the number of knots. The
# results are biased towards the current training/test split; that can be
# addressed later.
knot_counts <- seq(10, 250, 10)
acc <- numeric(length(knot_counts))
for (k in seq_along(knot_counts)) {
  nk <- knot_counts[k]
  # Refit the five per-phoneme curves with `nk` knots.
  spline_fits <- train_spline_models(n = nk)
  spl_mod_pred <- apply(
    test, 1,
    function(x) spline_classify(x, model = spline_fits)
  )
  # Fraction of test rows whose predicted label equals the true label.
  acc[k] <- sum(spl_mod_pred == g[-train_ix]) / nrow(test)
}
plot(seq(10, 250, 10), acc, type = "l")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment