mattbaggott · October 13, 2015 17:28
diff --git a/predictingbreaks.R b/predictingbreaks.R
 #######################
 #
 # short investigation of id'ing breakpoints in data
 #
 # [email protected]

 library("bcp")
 library("ggplot2")
 library("changepoint")
 library("reshape")
 # should investigate newer library("cpm")

 #######################
 #
 #  load data

 data = c(746,743,735,735,728,717,709,691,746,738,732,728,722,782,781,778,776,
 775,766,763,763,761,758,750,744,737,731,723,712,706,767,764,755,746,746,744,738,735,
 726,718,717,708,694,759,756,749,744,744,740,738,738,737,728,711,705,699,753,749,
 744,743,735,731,723,723,718,709,703,769,764,756,750,743,738,734,725,720,712,708,
 756,749,741,737,737,731,705,705,685,779,776,775,782,785,781,776,775,769,769,759,
 759,756,752,747,738,734,725,712,767,763,758,755,753,752,744,738,729,732,725,720,
 709,702,700,779,766,763,758,755,753,753,747,744,737,729,723,715,706,705,696,782,
 778,772,769,772,775,775,775,772,772,767,750,755,753,752,749,752,752,749,743,743,
 732,732,728,725,720,715,711,703,696)

 plot(data) # take a look

 data1 <- data.frame(vals=data)  # make a data frame

 data1$time <- row(data1)        # make a 'time' var

 #######################
 #  
 #  many change point approaches are looking for changes in mean or variance
 #  which we don't have here, and that is why the approach is failing, I think
 #  with the increased data in the newer example, it is estimating the mean better.
 #  One way around that would be to use sequential differences
 #  or some other derived value, here we use diff()

 data1$diffs <- c(0,diff(data1$vals)) # padding the start with a zero since there is no diff there
                                     # and we want equal lengths

 #######################
 #
 #  a simple answer might be something like this

 data1$bigdiffs <- c(0,diff(data1$vals))>30  # define break as a change of > 30 points

 #######################
 # or we make classifiers using the differences between points

 #######################
 # BCP method on diffs

 data1$bcpfit <- bcp(data1$diffs)$posterior.prob #from bcp


 #######################
 # PELT method on diffs

 cpts1=cpt.var(data1$diffs,method="PELT")@cpts #from changepoint
 cpts2=cpt.meanvar(data1$diffs,method="PELT")@cpts #from changepoint

 data1$cptfit1 <- 0          # populate the variable
 data1[cpts1,]$cptfit1 <- 1  # assign based on cpts1

 data1$cptfit2 <- 0          # populate the variable
 data1[cpts2,]$cptfit2 <- 1  # assign based on cpts1

 #######################
 # reorder columns for convenience: time should be first

 data1 <- data1[,c("time","vals","diffs","bigdiffs","cptfit1","cptfit2","bcpfit")]

 #######################
 # plot results

 data.m <- melt(data1,id.var=c("time")) #from reshape
 head(data.m)
 summary(data.m)
 ggplot(data.m,aes(x=time, y=value, colour=variable))+geom_line()+facet_grid(variable~., scales="free")

 #######################
 #
 # RESULTS: it looks like the simple bigdiffs approach outperforms the others
 #          maybe simpler is better
 #
 # HOWEVER, if we wanted to hand label a bunch of data, we could use it to train a classifier 
 #          that could combine these approaches
	#######################
	#
	# short investigation of id'ing breakpoints in data
	#
	# [email protected]

	library("bcp")
	library("ggplot2")
	library("changepoint")
	library("reshape")
	# should investigate newer library("cpm")

	#######################
	#
	# load data

	data = c(746,743,735,735,728,717,709,691,746,738,732,728,722,782,781,778,776,
	775,766,763,763,761,758,750,744,737,731,723,712,706,767,764,755,746,746,744,738,735,
	726,718,717,708,694,759,756,749,744,744,740,738,738,737,728,711,705,699,753,749,
	744,743,735,731,723,723,718,709,703,769,764,756,750,743,738,734,725,720,712,708,
	756,749,741,737,737,731,705,705,685,779,776,775,782,785,781,776,775,769,769,759,
	759,756,752,747,738,734,725,712,767,763,758,755,753,752,744,738,729,732,725,720,
	709,702,700,779,766,763,758,755,753,753,747,744,737,729,723,715,706,705,696,782,
	778,772,769,772,775,775,775,772,772,767,750,755,753,752,749,752,752,749,743,743,
	732,732,728,725,720,715,711,703,696)

	plot(data) # take a look

	data1 <- data.frame(vals=data) # make a data frame

	data1$time <- row(data1) # make a 'time' var

	#######################
	#
	# many change point approaches are looking for changes in mean or variance
	# which we don't have here, and that is why the approach is failing, I think
	# with the increased data in the newer example, it is estimating the mean better.
	# One way around that would be to use sequential differences
	# or some other derived value, here we use diff()

	data1$diffs <- c(0,diff(data1$vals)) # padding the start with a zero since there is no diff there
	# and we want equal lengths

	#######################
	#
	# a simple answer might be something like this

	data1$bigdiffs <- c(0,diff(data1$vals))>30 # define break as a change of > 30 points

	#######################
	# or we make classifiers using the differences between points

	#######################
	# BCP method on diffs

	data1$bcpfit <- bcp(data1$diffs)$posterior.prob #from bcp


	#######################
	# PELT method on diffs

	cpts1=cpt.var(data1$diffs,method="PELT")@cpts #from changepoint
	cpts2=cpt.meanvar(data1$diffs,method="PELT")@cpts #from changepoint

	data1$cptfit1 <- 0 # populate the variable
	data1[cpts1,]$cptfit1 <- 1 # assign based on cpts1

	data1$cptfit2 <- 0 # populate the variable
	data1[cpts2,]$cptfit2 <- 1 # assign based on cpts1

	#######################
	# reorder columns for convenience: time should be first

	data1 <- data1[,c("time","vals","diffs","bigdiffs","cptfit1","cptfit2","bcpfit")]

	#######################
	# plot results

	data.m <- melt(data1,id.var=c("time")) #from reshape
	head(data.m)
	summary(data.m)
	ggplot(data.m,aes(x=time, y=value, colour=variable))+geom_line()+facet_grid(variable~., scales="free")

	#######################
	#
	# RESULTS: it looks like the simple bigdiffs approach outperforms the others
	# maybe simpler is better
	#
	# HOWEVER, if we wanted to hand label a bunch of data, we could use it to train a classifier
	# that could combine these approaches