Last active
October 13, 2015 17:28
-
-
Save mattbaggott/4230765 to your computer and use it in GitHub Desktop.
short comparison of methods of finding breakpoints in a specific dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################### | |
# | |
# short investigation of id'ing breakpoints in data | |
# | |
# [email protected] | |
library("bcp") | |
library("ggplot2") | |
library("changepoint") | |
library("reshape") | |
# should investigate newer library("cpm") | |
####################### | |
# | |
# load data | |
data = c(746,743,735,735,728,717,709,691,746,738,732,728,722,782,781,778,776, | |
775,766,763,763,761,758,750,744,737,731,723,712,706,767,764,755,746,746,744,738,735, | |
726,718,717,708,694,759,756,749,744,744,740,738,738,737,728,711,705,699,753,749, | |
744,743,735,731,723,723,718,709,703,769,764,756,750,743,738,734,725,720,712,708, | |
756,749,741,737,737,731,705,705,685,779,776,775,782,785,781,776,775,769,769,759, | |
759,756,752,747,738,734,725,712,767,763,758,755,753,752,744,738,729,732,725,720, | |
709,702,700,779,766,763,758,755,753,753,747,744,737,729,723,715,706,705,696,782, | |
778,772,769,772,775,775,775,772,772,767,750,755,753,752,749,752,752,749,743,743, | |
732,732,728,725,720,715,711,703,696) | |
plot(data) # take a look | |
data1 <- data.frame(vals=data) # make a data frame | |
data1$time <- row(data1) # make a 'time' var | |
####################### | |
# | |
# many change point approaches are looking for changes in mean or variance | |
# which we don't have here, and that is why the approach is failing, I think | |
# with the increased data in the newer example, it is estimating the mean better. | |
# One way around that would be to use sequential differences | |
# or some other derived value, here we use diff() | |
data1$diffs <- c(0,diff(data1$vals)) # padding the start with a zero since there is no diff there | |
# and we want equal lengths | |
####################### | |
# | |
# a simple answer might be something like this | |
data1$bigdiffs <- c(0,diff(data1$vals))>30 # define break as a change of > 30 points | |
####################### | |
# or we make classifiers using the differences between points | |
####################### | |
# BCP method on diffs | |
data1$bcpfit <- bcp(data1$diffs)$posterior.prob #from bcp | |
####################### | |
# PELT method on diffs | |
cpts1=cpt.var(data1$diffs,method="PELT")@cpts #from changepoint | |
cpts2=cpt.meanvar(data1$diffs,method="PELT")@cpts #from changepoint | |
data1$cptfit1 <- 0 # populate the variable | |
data1[cpts1,]$cptfit1 <- 1 # assign based on cpts1 | |
data1$cptfit2 <- 0 # populate the variable | |
data1[cpts2,]$cptfit2 <- 1 # assign based on cpts1 | |
####################### | |
# reorder columns for convenience: time should be first | |
data1 <- data1[,c("time","vals","diffs","bigdiffs","cptfit1","cptfit2","bcpfit")] | |
####################### | |
# plot results | |
data.m <- melt(data1,id.var=c("time")) #from reshape | |
head(data.m) | |
summary(data.m) | |
ggplot(data.m,aes(x=time, y=value, colour=variable))+geom_line()+facet_grid(variable~., scales="free") | |
####################### | |
# | |
# RESULTS: it looks like the simple bigdiffs approach outperforms the others | |
# maybe simpler is better | |
# | |
# HOWEVER, if we wanted to hand label a bunch of data, we could use it to train a classifier | |
# that could combine these approaches |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment