library(data.table)
## Simulate 10 segments with 10 data points each (true segment means 1 to 10).
n.segments <- 10
seg.mean.vec <- 1:n.segments
data.per.segment <- 10
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
## Each split method generates 100 validation sets, either via K-fold
## cross-validation (K=10 or K=2) or via random splits with the
## corresponding validation proportion (10% or 50%).
n.validation.sets <- 100
n.folds.vec <- c(10, 2)
prop.valid.vec <- 1/n.folds.vec
sim.result.list <- list()
for(data.seed in 1:100){
  set.seed(data.seed)
  data.vec <- rnorm(n.data, data.mean.vec, 0.1)
  is.valid.vec.list <- list()
  ## K-fold splits: each random partition into K folds yields K validation
  ## sets (one per fold), so n.seeds partitions give 100 validation sets.
  for(n.folds in n.folds.vec){
    uniq.folds <- 1:n.folds
    n.seeds <- n.validation.sets/n.folds
    split.type <- sprintf("%d-fold %d times", n.folds, n.seeds)
    for(seed in 1:n.seeds){
      set.seed(seed)
      fold.vec <- sample(rep(uniq.folds, l=n.data))
      for(valid.fold in uniq.folds){
        is.valid.vec.list[[split.type]][[paste(seed, valid.fold)]] <-
          fold.vec==valid.fold
      }
    }
  }
  ## Random splits: in each of 100 splits, a random prop.valid of the data
  ## is assigned to the validation set.
  for(prop.valid in prop.valid.vec){
    split.type <- sprintf("%d%% %d times", 100*prop.valid, n.validation.sets)
    prop.vec <- c(subtrain=1-prop.valid, validation=prop.valid)
    for(split.i in 1:n.validation.sets){
      set.seed(split.i)
      is.valid.vec.list[[split.type]][[split.i]] <- binsegRcpp::random_set_vec(
        n.data, prop.vec) == "validation"
    }
  }
  ## Fit binary segmentation to the subtrain data of each split, recording
  ## the validation loss for each model size (number of segments).
  loss.dt <- CJ(split.i=1:n.validation.sets, type=names(is.valid.vec.list))[, {
    is.valid <- is.valid.vec.list[[type]][[split.i]]
    bs.model <- binsegRcpp::binseg_normal(data.vec, is.validation.vec=is.valid)
    bs.model$splits[, data.table(
      segments,
      validation.loss)]
  }, by=.(split.i, type)]
  loss.stats <- loss.dt[, .(
    mean.valid.loss=mean(validation.loss)
  ), by=.(type, segments)]
  ## For each split, the model size which minimizes the validation loss.
  select.each.split <- loss.dt[
  , .SD[which.min(validation.loss)],
    by=.(type, split.i)]
  selected.times <- select.each.split[, .(
    times=.N
  ), by=.(type, segments)]
  ## Three model selection rules, applied to each split type.
  selected.segments <- rbind(
    select.each.split[, .(
      selected=min(segments)
    ), by=.(method=paste(type, "min err, min segs"))],
    selected.times[, .(
      selected=segments[which.max(times)]
    ), by=.(method=paste(type, "min err, max times"))],
    loss.stats[, .(
      selected=segments[which.min(mean.valid.loss)]
    ), by=.(method=paste(type, "mean err, min err"))]
  )
  sim.result.list[[data.seed]] <- data.table(
    data.seed, selected.segments, n.segments)
}
sim.result <- do.call(rbind, sim.result.list)
(sim.err <- sim.result[, .(
  zero.one.loss=sum(selected != n.segments),
  L1.loss=sum(abs(selected-n.segments)),
  L2.loss=sum((selected-n.segments)^2)
), by=method][order(zero.one.loss)])
#> method zero.one.loss L1.loss L2.loss
#> <char> <int> <num> <num>
#> 1: 10% 100 times min err, max times 2 4 10
#> 2: 10-fold 10 times min err, max times 2 5 13
#> 3: 2-fold 50 times min err, max times 3 4 6
#> 4: 50% 100 times min err, max times 5 6 8
#> 5: 2-fold 50 times min err, min segs 7 7 7
#> 6: 50% 100 times min err, min segs 27 27 27
#> 7: 50% 100 times mean err, min err 42 128 1166
#> 8: 10% 100 times mean err, min err 45 232 3310
#> 9: 10-fold 10 times mean err, min err 46 198 2128
#> 10: 2-fold 50 times mean err, min err 47 108 898
#> 11: 10% 100 times min err, min segs 100 263 829
#> 12: 10-fold 10 times min err, min segs 100 321 1135
plot(data.vec)
The code above compares several types of cross-validation for selecting the number of segments in simulated random normal data. The table above shows error rates (zero-one, L1, and L2 loss) which compare the selected number of segments to the true number of segments used in the simulation. The best methods appear to be the ones which use the min err, max times selection rule.
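To make the min err, max times rule concrete: for each validation set we record the model size which minimizes the validation loss, then over all validation sets we select the size which was chosen most often. Below is a minimal sketch of that counting step, using a hypothetical table of per-split selections (not data from the simulation above).

## Hypothetical per-split selections (model size with min validation loss).
toy.selected <- data.table(
  split.i=1:5,
  segments=c(10, 10, 9, 10, 11))
## Count how often each size was selected, then take the most frequent one.
toy.times <- toy.selected[, .(times=.N), by=segments]
toy.times[which.max(times), segments]
#> [1] 10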
## Second simulation: 20 segments with only 5 data points each, and a range
## of validation set proportions, to study how many random splits are needed
## for stable model selection.
n.segments <- 20
seg.mean.vec <- 1:n.segments
data.per.segment <- 5
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 200
prop.valid <- c(0.01, 0.05, 0.1, 0.25, 0.5)
sim.result <- data.table(data.seed=1:100)[, {
  set.seed(data.seed)
  data.vec <- rnorm(n.data, data.mean.vec, 0.1)
  ## For each random split, the model size which minimizes validation loss.
  select.each.split <- CJ(split.i=1:n.validation.sets, prop.valid)[, {
    set.seed(split.i)
    prop.sets <- c(subtrain=1-prop.valid, validation=prop.valid)
    is.valid <- binsegRcpp::random_set_vec(
      n.data, prop.sets)=="validation"
    bs.model <- binsegRcpp::binseg_normal(
      data.vec, is.validation.vec=is.valid)
    bs.model$splits[, .(selected=segments[which.min(validation.loss)])]
  }, by=.(split.i, prop.valid)]
  ## For each number of splits, the model size selected most often among
  ## the first n.splits splits (min err, max times rule).
  data.table(n.splits=1:n.validation.sets)[, {
    select.each.split[split.i <= n.splits, .(
      times=.N
    ),
    by=.(prop.valid, selected)
    ][, .SD[which.max(times), .(selected)], by=prop.valid]
  }, by=n.splits]
}, by=data.seed]
if(require(ggplot2)){
  ggplot()+
    scale_color_gradient(low="red", high="black")+
    geom_line(aes(
      n.splits, selected,
      group=paste(data.seed, prop.valid),
      color=prop.valid),
      data=sim.result)+
    scale_y_continuous(breaks=seq(0, 100, by=10))
}
#> Loading required package: ggplot2
accuracy.dt <- sim.result[, .(
  correct=sum(selected==n.segments)
), by=.(prop.valid, n.splits)]
if(require(ggplot2)){
  gg <- ggplot()+
    geom_line(aes(
      n.splits, correct,
      group=prop.valid,
      color=prop.valid),
      size=2,
      data=accuracy.dt)+
    scale_y_continuous("number of correctly chosen data sets")+
    scale_color_gradient(low="red", high="black")
  if(require(directlabels)){
    direct.label(gg, "right.polygons")
  }else{
    gg
  }
}
#> Loading required package: directlabels
The plot above suggests that about 100 validation sets are sufficient for accurate selection of the number of segments.
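One way to check that claim numerically is to compare, for each validation proportion, the number of correctly chosen data sets at 100 splits with the number at the maximum of 200 splits. Below is a minimal sketch using the accuracy.dt table computed above (output not shown; values depend on the simulation).

## Reshape accuracy.dt to compare accuracy at 100 splits versus 200 splits;
## the resulting columns named 100 and 200 count correctly chosen data sets.
accuracy.wide <- dcast(
  accuracy.dt[n.splits %in% c(100, 200)],
  prop.valid ~ n.splits, value.var="correct")
accuracy.wide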