Last active
March 27, 2016 01:52
-
-
Save dmarx/8f6edf0e57b79acc4b97 to your computer and use it in GitHub Desktop.
Spline-model classification applied to the phoneme data from ESL II (Hastie, Tibshirani & Friedman).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reproducible 80/20 train/test split of the ESL phoneme data.
set.seed(123)

library(ElemStatLearn)
library(reshape2)
library(caret)

dim(phoneme)  # 4509 x 258

# First 256 columns are the log-periodogram features; `g` is the phoneme label.
dat <- phoneme[, 1:256]
g <- phoneme$g
n <- nrow(dat)

# sample() truncates n * 0.8 to an integer count of training rows.
train_ix <- sample(n, n * 0.8)
X <- dat[train_ix, ]
Y <- g[train_ix]
colnames(X) <- 1:256
test <- dat[-train_ix, ]
table(Y)
#  aa  ao dcl  iy  sh
# 563 827 590 939 688
# Not a perfectly balanced dataset, but shouldn't
# be a huge problem. I can implement cross-validation
# or more balanced sampling later.

# Let's try plotting 10 examples of "aa" data against "ao" data
# as an experiment. They do 15 in ESLII, but I downsampled to 10
# because I found 15 to be very messy.

# Per-class training subsets: rows of X whose label matches each phoneme.
aa  <- X[Y == "aa", ]
ao  <- X[Y == "ao", ]
sh  <- X[Y == "sh", ]
iy  <- X[Y == "iy", ]
dcl <- X[Y == "dcl", ]
# Overlay ten example curves from each of the two most confusable classes
# ("aa" in green, "ao" in red) on an empty canvas.
plot(0, 0,
     xlim = c(0, 260), ylim = c(0, 25),
     xlab = "Log-periodogram",
     ylab = "Frequency",
     main = "10 Random Samples from 2 phonemes")
for (ex in 1:10) {
  lines(1:256, aa[ex, ], col = "green")
  lines(1:256, ao[ex, ], col = "red")
}
#######################################
# Sanity check a spline fit
plot(0, 0,
     xlim = c(0, 260), ylim = c(0, 25),
     xlab = "Log-periodogram",
     ylab = "Frequency",
     main = "Plot of ALL 'AA' data")
# BUG FIX: length() of a data.frame is its number of COLUMNS (256 here), not
# its number of rows, so the original loop silently plotted only the first
# 256 "aa" records instead of all of them. Use nrow() to iterate every row.
for (i in seq_len(nrow(aa))) {
  lines(1:256, aa[i, ], col = "green")
}

# Fit one smoothing spline to the pooled (position, value) pairs of every
# "aa" record, then overlay the fitted mean curve on the plot.
aa_melt <- melt(aa)
spl_aa <- smooth.spline(aa_melt, nknots = 256)
lines(y ~ x, predict(spl_aa, 1:256))
#####################################################
# Long-format (position, value) tables, one per phoneme class; these are the
# inputs the smoothing splines are fitted on.
aa_melt  <- melt(aa)
ao_melt  <- melt(ao)
sh_melt  <- melt(sh)
iy_melt  <- melt(iy)
dcl_melt <- melt(dcl)
# Fit one smoothing spline per phoneme class and return each class's fitted
# mean curve evaluated at the 256 frequency positions.
#
# Modularized so the number of knots can be tuned later.
#
# n: number of knots passed to smooth.spline() (default 256, one per position).
#
# Returns: a list of 5 numeric vectors (each length 256), in the order
#   aa, ao, sh, iy, dcl -- this order must stay in sync with the label
#   vector inside spline_classify().
#
# NOTE(review): reads the *_melt data frames from the global environment;
# they must exist before this is called.
train_spline_models <- function(n = 256) {
  melted <- list(aa_melt, ao_melt, sh_melt, iy_melt, dcl_melt)
  spline_models <- lapply(melted, smooth.spline, nknots = n)
  lapply(spline_models, function(mod) predict(mod, 1:256)$y)
}
# Default set of fitted class curves used by spline_classify().
spline_fits <- train_spline_models()
# Nearest-mean-curve classifier.
#
# x:     a 256-d numeric vector (one log-periodogram record) to be classified
#        as one of the 5 sounds.
# model: list of 5 fitted class curves in the order aa, ao, sh, iy, dcl
#        (as produced by train_spline_models()).
#
# Returns: the label of the class whose curve has the smallest mean squared
#   error against x. which.min() breaks ties by taking the first class.
spline_classify <- function(x, model = spline_fits) {
  # vapply() instead of sapply(): guarantees a numeric vector even for
  # degenerate model lists.
  MSE <- vapply(model, function(v) mean((v - x)^2), numeric(1))
  classify <- c('aa', 'ao', 'sh', 'iy', 'dcl')
  classify[which.min(MSE)]
}
# Apply classifier over each record of the test set, build confusion matrix.
# apply() coerces the data.frame to a numeric matrix and hands each row to
# spline_classify(), returning a character vector of predicted labels.
spl_mod1_pred <- apply(test, 1, spline_classify)
# confusionMatrix() requires factors with identical levels; coerce the
# character predictions explicitly (newer caret versions error on characters).
confusionMatrix(factor(spl_mod1_pred, levels = levels(g)), g[-train_ix])
#############################################################
# Investigate how accuracy changes with number of knots. Results will be
# biased to the current training/test set split, but I can fix that later.
knot_grid <- seq(10, 250, 10)
# Preallocate instead of growing acc with c() on every iteration (O(n^2)).
acc <- numeric(length(knot_grid))
for (i in seq_along(knot_grid)) {
  spline_fits <- train_spline_models(n = knot_grid[i])
  spl_mod_pred <- apply(test, 1, function(x) spline_classify(x, model = spline_fits))
  # mean(preds == truth) is identical to sum(...) / nrow(test).
  acc[i] <- mean(spl_mod_pred == g[-train_ix])
}
plot(knot_grid, acc, type = "l")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment