Segmentation

Three clusters, one predictive law

Simulation

library(glmtree)
# Simulate 1000 observations under the "no tree" scenario, with the
# generator's visualisation switched on.
data <- generateData(n = 1000, scenario = "no tree", visualize = TRUE)

# Draw 0.2 * 1000 = 200 distinct row indices; these rows form the training set.
int_train <- sample.int(n = 1000, size = 0.2 * 1000)

# Hold out every row that was not drawn, then shrink `data` to the drawn rows.
test <- data[-int_train, ]
data <- data[int_train, ]

PCA

library(FactoMineR)
# Fit the PCA on the two training covariates only.
mixed <- PCA(data[, c("x1", "x2")])

# Project both samples onto the fitted axes and keep the first two coordinates.
train_coord <- predict(mixed, data)$coord
test_coord <- predict(mixed, test)$coord

data$pca1 <- train_coord[, 1]
data$pca2 <- train_coord[, 2]
test$pca1 <- test_coord[, 1]
test$pca2 <- test_coord[, 2]

# Segment on the first axis: score > 1 -> 1, 0 < score <= 1 -> 2, else 3.
cluster_from_axis <- function(score) {
  ifelse(score > 1, 1, ifelse(score > 0, 2, 3))
}

data$cluster <- cluster_from_axis(data$pca1)
test$cluster <- cluster_from_axis(test$pca1)

# One predicted probability per held-out row. The buffer is sized from `test`
# itself: the previous 0.2 * 1000 rows matched the training sample, not the
# test sample, and only worked because out-of-range assignment silently
# extended the matrix and dropped its dimensions.
pred <- numeric(nrow(test))

# Fit one logistic regression per PCA cluster on the training rows and score
# the test rows that fall in the same cluster.
for (j in 1:3) {
  modele <- glm(
    y ~ x1 + x2,
    data = data[data$cluster == j, ],
    family = binomial(link = "logit")
  )
  pred[test$cluster == j] <- predict(
    modele,
    test[test$cluster == j, ],
    type = "response"
  )
}

# Out-of-sample Gini of the cluster-wise models.
normalizedGini(test$y, pred)
## [1] 0.6604591
# Individuals map of the fitted PCA, drawn without point labels.
plot(
  mixed,
  choix = "ind",
  label = "none"
)

MOB

# Model-based recursive partitioning benchmark; skipped when partykit is not
# installed. x1 and x2 serve both as regressors and as partitioning variables.
if (require(partykit, quietly = TRUE)) {
  mob_data <- partykit::glmtree(
    formula = y ~ x1 + x2 | x1 + x2,
    data = data,
    family = binomial
  )

  plot(mob_data)

  # Last expression of the branch, so the Gini is what gets printed.
  normalizedGini(
    test$y,
    predict(mob_data, test)
  )
}
## 
## Attaching package: 'partykit'
## The following object is masked from 'package:glmtree':
## 
##     glmtree

## [1] 0.6793697

glmtree approach

# Fit the glmtree model on the training covariates and response. The call is
# namespace-qualified because partykit also exports a function named glmtree.
tree <- glmtree::glmtree(
  x = data[, c("x1", "x2")],
  y = data$y
)
## The bic criterion for iteration 1 is 0
## The bic criterion for iteration 2 is -225.650543595755
## The bic criterion for iteration 3 is -225.650543595755
## The bic criterion for iteration 4 is -225.650543595755
## The bic criterion for iteration 5 is -225.650543595755
## The bic criterion for iteration 6 is -225.650543595755
## The bic criterion for iteration 7 is -225.650543595755
## The bic criterion for iteration 8 is -225.650543595755
## The bic criterion for iteration 9 is -225.650543595755
## The bic criterion for iteration 10 is -225.650543595755
## The bic criterion for iteration 11 is -225.650543595755
## The bic criterion for iteration 12 is -225.650543595755
## The bic criterion for iteration 13 is -225.650543595755
## The bic criterion for iteration 14 is -225.650543595755
## The bic criterion for iteration 15 is -225.650543595755
## The bic criterion for iteration 16 is -225.650543595755
## The bic criterion for iteration 17 is -225.650543595755
## The bic criterion for iteration 18 is -225.650543595755
## The bic criterion for iteration 19 is -225.650543595755
## The bic criterion for iteration 20 is -225.650543595755
## The bic criterion for iteration 21 is -225.650543595755
## The bic criterion for iteration 22 is -225.650543595755
## The bic criterion for iteration 23 is -225.650543595755
## The bic criterion for iteration 24 is -225.650543595755
## The bic criterion for iteration 25 is -225.650543595755
## The bic criterion for iteration 26 is -225.650543595755
## The bic criterion for iteration 27 is -225.650543595755
## The bic criterion for iteration 28 is -225.650543595755
## The bic criterion for iteration 29 is -225.650543595755
## The bic criterion for iteration 30 is -225.650543595755
## The bic criterion for iteration 31 is -225.650543595755
## The bic criterion for iteration 32 is -225.650543595755
## The bic criterion for iteration 33 is -225.650543595755
## The bic criterion for iteration 34 is -225.650543595755
## The bic criterion for iteration 35 is -225.650543595755
## The bic criterion for iteration 36 is -225.650543595755
## The bic criterion for iteration 37 is -225.650543595755
## The bic criterion for iteration 38 is -225.650543595755
## The bic criterion for iteration 39 is -225.650543595755
## The bic criterion for iteration 40 is -225.650543595755
## The bic criterion for iteration 41 is -225.650543595755
## The bic criterion for iteration 42 is -225.650543595755
## The bic criterion for iteration 43 is -225.650543595755
## The bic criterion for iteration 44 is -225.650543595755
## The bic criterion for iteration 45 is -225.650543595755
## The bic criterion for iteration 46 is -225.650543595755
## The bic criterion for iteration 47 is -225.650543595755
## The bic criterion for iteration 48 is -225.650543595755
## The bic criterion for iteration 49 is -225.650543595755
## The bic criterion for iteration 50 is -225.650543595755
## The bic criterion for iteration 51 is -225.650543595755
## The bic criterion for iteration 52 is -225.650543595755
## The bic criterion for iteration 53 is -225.650543595755
## The bic criterion for iteration 54 is -225.650543595755
## The bic criterion for iteration 55 is -225.650543595755
## The bic criterion for iteration 56 is -225.650543595755
## The bic criterion for iteration 57 is -225.650543595755
## The bic criterion for iteration 58 is -225.650543595755
## The bic criterion for iteration 59 is -225.650543595755
## The bic criterion for iteration 60 is -225.650543595755
## The bic criterion for iteration 61 is -225.650543595755
## The bic criterion for iteration 62 is -225.650543595755
## The bic criterion for iteration 63 is -225.650543595755
## The bic criterion for iteration 64 is -225.650543595755
## The bic criterion for iteration 65 is -225.650543595755
## The bic criterion for iteration 66 is -225.650543595755
## The bic criterion for iteration 67 is -225.650543595755
## The bic criterion for iteration 68 is -225.650543595755
## The bic criterion for iteration 69 is -225.650543595755
## The bic criterion for iteration 70 is -225.650543595755
## The bic criterion for iteration 71 is -225.650543595755
## The bic criterion for iteration 72 is -225.650543595755
## The bic criterion for iteration 73 is -225.650543595755
## The bic criterion for iteration 74 is -225.650543595755
## The bic criterion for iteration 75 is -225.650543595755
## The bic criterion for iteration 76 is -225.650543595755
## The bic criterion for iteration 77 is -225.650543595755
## The bic criterion for iteration 78 is -225.650543595755
## The bic criterion for iteration 79 is -225.650543595755
## The bic criterion for iteration 80 is -225.650543595755
## The bic criterion for iteration 81 is -225.650543595755
## The bic criterion for iteration 82 is -225.650543595755
## The bic criterion for iteration 83 is -225.650543595755
# Trace of the selection criterion across iterations.
plot(unlist(tree@performance$criterionEvolution), type = "l")

# For each row of `newdata`, return the label (column name) with the highest
# predicted probability under the best tree, as a factor.
most_likely_segment <- function(newdata) {
  seg_prob <- predict(tree@best.tree$tree, newdata, type = "prob")
  factor(apply(seg_prob, 1, function(p) names(which.max(p))))
}

# Each sample is scored on its own rows. The test assignment used to be
# computed from `data`, which recycled the 200 training labels over the
# 800 test rows.
data$c_map <- most_likely_segment(data)
test$c_map <- most_likely_segment(test)

# Training-set size of each segment.
table(data$c_map)
## 
##   4 
## 200
# Scatter of the training sample coloured by assigned segment. Columns are
# addressed by position: presumably 1 and 2 are x1/x2 and column 3 drives the
# plotting symbol -- TODO confirm against the layout returned by generateData().
plot(
  data[, 1], data[, 2],
  pch = 2 + data[, 3],
  col = as.numeric(data$c_map),
  xlab = "First coordinate",
  ylab = "Second coordinate"
)

plot(tree@best.tree$tree)

# One predicted probability per held-out row. The buffer is sized from `test`
# itself: the previous 0.2 * 1000 rows matched the training sample, not the
# test sample, and only worked because out-of-range assignment silently
# extended the matrix and dropped its dimensions.
pred <- numeric(nrow(test))

# Refit one logistic regression per segment on the training rows and score the
# test rows assigned to the same segment.
for (j in levels(data$c_map)) {
  modele <- glm(
    y ~ x1 + x2,
    data = data[data$c_map == j, ],
    family = binomial(link = "logit")
  )
  pred[test$c_map == j] <- predict(
    modele,
    test[test$c_map == j, ],
    type = "response"
  )
}

# Out-of-sample Gini of the segment-wise models.
normalizedGini(test$y, pred)
## [1] 0.6793697

One “cluster”, three predictive laws

Simulation

# Simulate 1000 observations under the "tree" scenario, with the generator's
# visualisation switched on.
data <- generateData(n = 1000, scenario = "tree", visualize = TRUE)

# Draw 0.2 * 1000 = 200 distinct row indices; these rows form the training set.
int_train <- sample.int(n = 1000, size = 0.2 * 1000)

# Hold out every row that was not drawn, then shrink `data` to the drawn rows.
test <- data[-int_train, ]
data <- data[int_train, ]

FAMD (PCA for mixed data)

# Factor analysis of mixed data fitted on the three training covariates.
mixed <- FAMD(data[, c("x1", "x2", "x3")])

# Split each sample on the sign of its own first-dimension coordinate.
# The training rows need their own mask: reusing the test mask on `data`
# selected training rows by an unrelated, longer logical vector and padded
# the fitting sample with all-NA rows.
train_famd <- predict(mixed, data)$coord[, "Dim 1"] < 0
dim_famd <- predict(mixed, test)$coord[, "Dim 1"] < 0

# One predicted probability per held-out row (sized from `test`, not from the
# training fraction).
pred <- numeric(nrow(test))

# Fit one logistic regression per side of the split on the training rows and
# score the test rows that fall on the same side.
for (j in c(TRUE, FALSE)) {
  modele <- glm(
    y ~ x1 + x2 + x3,
    data = data[train_famd == j, ],
    family = binomial(link = "logit")
  )
  pred[dim_famd == j] <- predict(
    modele,
    test[dim_famd == j, ],
    type = "response"
  )
}

# Out-of-sample Gini of the two-segment model.
normalizedGini(test$y, pred)
## [1] 0.1205429

MOB

# Model-based recursive partitioning benchmark; skipped when partykit is not
# installed. x1, x2 and x3 serve both as regressors and as partitioning
# variables.
if (require(partykit, quietly = TRUE)) {
  mob_data <- partykit::glmtree(
    formula = y ~ x1 + x2 + x3 | x1 + x2 + x3,
    data = data,
    family = binomial
  )

  plot(mob_data)

  # Last expression of the branch, so the Gini is what gets printed.
  normalizedGini(
    test$y,
    predict(mob_data, test)
  )
}

## [1] 0.4322791

glmtree approach

# Fit the glmtree model on the training covariates and response. The call is
# namespace-qualified because partykit also exports a function named glmtree.
tree <- glmtree::glmtree(
  x = data[, c("x1", "x2", "x3")],
  y = data$y
)
## The bic criterion for iteration 1 is 0
## The bic criterion for iteration 2 is -305.907700290713
## The bic criterion for iteration 3 is -305.907700290713
## The bic criterion for iteration 4 is -305.907700290713
## The bic criterion for iteration 5 is -305.907700290713
## The bic criterion for iteration 6 is -305.907700290713
## The bic criterion for iteration 7 is -279.110033109741
## The bic criterion for iteration 8 is -279.110033109741
## The bic criterion for iteration 9 is -279.110033109741
## The bic criterion for iteration 10 is -279.110033109741
## The bic criterion for iteration 11 is -279.110033109741
## The bic criterion for iteration 12 is -279.110033109741
## The bic criterion for iteration 13 is -279.110033109741
## The bic criterion for iteration 14 is -279.110033109741
## The bic criterion for iteration 15 is -279.110033109741
## The bic criterion for iteration 16 is -279.110033109741
## The bic criterion for iteration 17 is -279.110033109741
## The bic criterion for iteration 18 is -279.110033109741
## The bic criterion for iteration 19 is -279.110033109741
## The bic criterion for iteration 20 is -279.110033109741
## The bic criterion for iteration 21 is -279.110033109741
## The bic criterion for iteration 22 is -279.110033109741
## The bic criterion for iteration 23 is -279.110033109741
## The bic criterion for iteration 24 is -279.110033109741
## The bic criterion for iteration 25 is -279.110033109741
## The bic criterion for iteration 26 is -279.110033109741
## The bic criterion for iteration 27 is -279.110033109741
## The bic criterion for iteration 28 is -279.110033109741
## The bic criterion for iteration 29 is -279.110033109741
## The bic criterion for iteration 30 is -279.110033109741
## The bic criterion for iteration 31 is -279.110033109741
## The bic criterion for iteration 32 is -279.110033109741
## The bic criterion for iteration 33 is -279.110033109741
## The bic criterion for iteration 34 is -279.110033109741
## The bic criterion for iteration 35 is -279.110033109741
## The bic criterion for iteration 36 is -279.110033109741
## The bic criterion for iteration 37 is -279.110033109741
## The bic criterion for iteration 38 is -279.110033109741
## The bic criterion for iteration 39 is -279.110033109741
## The bic criterion for iteration 40 is -279.110033109741
## The bic criterion for iteration 41 is -279.110033109741
## The bic criterion for iteration 42 is -279.110033109741
## The bic criterion for iteration 43 is -279.110033109741
## The bic criterion for iteration 44 is -279.110033109741
## The bic criterion for iteration 45 is -279.110033109741
## The bic criterion for iteration 46 is -279.110033109741
## The bic criterion for iteration 47 is -279.110033109741
## The bic criterion for iteration 48 is -279.110033109741
## The bic criterion for iteration 49 is -279.110033109741
## The bic criterion for iteration 50 is -279.110033109741
## The bic criterion for iteration 51 is -279.110033109741
## The bic criterion for iteration 52 is -279.110033109741
## The bic criterion for iteration 53 is -279.110033109741
## The bic criterion for iteration 54 is -279.110033109741
## The bic criterion for iteration 55 is -279.110033109741
## The bic criterion for iteration 56 is -279.110033109741
## The bic criterion for iteration 57 is -279.110033109741
## The bic criterion for iteration 58 is -279.110033109741
## The bic criterion for iteration 59 is -279.110033109741
## The bic criterion for iteration 60 is -279.110033109741
## The bic criterion for iteration 61 is -279.110033109741
## The bic criterion for iteration 62 is -279.110033109741
## The bic criterion for iteration 63 is -279.110033109741
## The bic criterion for iteration 64 is -279.110033109741
## The bic criterion for iteration 65 is -279.110033109741
## The bic criterion for iteration 66 is -279.110033109741
## The bic criterion for iteration 67 is -279.110033109741
## The bic criterion for iteration 68 is -279.110033109741
## The bic criterion for iteration 69 is -279.110033109741
## The bic criterion for iteration 70 is -279.110033109741
## The bic criterion for iteration 71 is -279.110033109741
## The bic criterion for iteration 72 is -279.110033109741
## The bic criterion for iteration 73 is -279.110033109741
## The bic criterion for iteration 74 is -279.110033109741
## The bic criterion for iteration 75 is -279.110033109741
## The bic criterion for iteration 76 is -279.110033109741
## The bic criterion for iteration 77 is -279.110033109741
## The bic criterion for iteration 78 is -279.110033109741
## The bic criterion for iteration 79 is -279.110033109741
## The bic criterion for iteration 80 is -279.110033109741
## The bic criterion for iteration 81 is -279.110033109741
## The bic criterion for iteration 82 is -279.110033109741
## The bic criterion for iteration 83 is -279.110033109741
## The bic criterion for iteration 84 is -279.110033109741
## The bic criterion for iteration 85 is -279.110033109741
## The bic criterion for iteration 86 is -279.110033109741
## The bic criterion for iteration 87 is -279.110033109741
## The bic criterion for iteration 88 is -279.110033109741
## The bic criterion for iteration 89 is -279.110033109741
## The bic criterion for iteration 90 is -279.110033109741
## The bic criterion for iteration 91 is -279.110033109741
## The bic criterion for iteration 92 is -279.110033109741
## The bic criterion for iteration 93 is -279.110033109741
## The bic criterion for iteration 94 is -279.110033109741
## The bic criterion for iteration 95 is -279.110033109741
## The bic criterion for iteration 96 is -279.110033109741
## The bic criterion for iteration 97 is -279.110033109741
## The bic criterion for iteration 98 is -279.110033109741
## The bic criterion for iteration 99 is -279.110033109741
## The bic criterion for iteration 100 is -279.110033109741
## The bic criterion for iteration 101 is -279.110033109741
## The bic criterion for iteration 102 is -279.110033109741
## The bic criterion for iteration 103 is -279.110033109741
## The bic criterion for iteration 104 is -279.110033109741
## The bic criterion for iteration 105 is -279.110033109741
## The bic criterion for iteration 106 is -279.110033109741
## The bic criterion for iteration 107 is -279.110033109741
## The bic criterion for iteration 108 is -279.110033109741
## The bic criterion for iteration 109 is -279.110033109741
## The bic criterion for iteration 110 is -279.110033109741
## The bic criterion for iteration 111 is -279.110033109741
## The bic criterion for iteration 112 is -279.110033109741
## The bic criterion for iteration 113 is -279.110033109741
## The bic criterion for iteration 114 is -279.110033109741
## The bic criterion for iteration 115 is -279.110033109741
## The bic criterion for iteration 116 is -279.110033109741
## The bic criterion for iteration 117 is -279.110033109741
## The bic criterion for iteration 118 is -279.110033109741
## The bic criterion for iteration 119 is -279.110033109741
## The bic criterion for iteration 120 is -279.110033109741
## The bic criterion for iteration 121 is -279.110033109741
## The bic criterion for iteration 122 is -279.110033109741
## The bic criterion for iteration 123 is -279.110033109741
## The bic criterion for iteration 124 is -279.110033109741
## The bic criterion for iteration 125 is -279.110033109741
## The bic criterion for iteration 126 is -279.110033109741
## The bic criterion for iteration 127 is -279.110033109741
## The bic criterion for iteration 128 is -279.110033109741
## The bic criterion for iteration 129 is -279.110033109741
## The bic criterion for iteration 130 is -279.110033109741
## The bic criterion for iteration 131 is -279.110033109741
## The bic criterion for iteration 132 is -279.110033109741
## The bic criterion for iteration 133 is -279.110033109741
## The bic criterion for iteration 134 is -279.110033109741
## The bic criterion for iteration 135 is -279.110033109741
## The bic criterion for iteration 136 is -279.110033109741
## The bic criterion for iteration 137 is -279.110033109741
## The bic criterion for iteration 138 is -279.110033109741
## The bic criterion for iteration 139 is -279.110033109741
## The bic criterion for iteration 140 is -279.110033109741
## The bic criterion for iteration 141 is -279.110033109741
## The bic criterion for iteration 142 is -279.110033109741
## The bic criterion for iteration 143 is -279.110033109741
## The bic criterion for iteration 144 is -279.110033109741
## The bic criterion for iteration 145 is -279.110033109741
## The bic criterion for iteration 146 is -279.110033109741
## The bic criterion for iteration 147 is -279.110033109741
## The bic criterion for iteration 148 is -279.110033109741
## The bic criterion for iteration 149 is -279.110033109741
## The bic criterion for iteration 150 is -279.110033109741
## The bic criterion for iteration 151 is -279.110033109741
## The bic criterion for iteration 152 is -279.110033109741
## The bic criterion for iteration 153 is -279.110033109741
## The bic criterion for iteration 154 is -279.110033109741
## The bic criterion for iteration 155 is -279.110033109741
## The bic criterion for iteration 156 is -279.110033109741
## The bic criterion for iteration 157 is -279.110033109741
## The bic criterion for iteration 158 is -279.110033109741
## The bic criterion for iteration 159 is -279.110033109741
## The bic criterion for iteration 160 is -279.110033109741
## The bic criterion for iteration 161 is -279.110033109741
## The bic criterion for iteration 162 is -279.110033109741
## The bic criterion for iteration 163 is -279.110033109741
## The bic criterion for iteration 164 is -279.110033109741
## The bic criterion for iteration 165 is -279.110033109741
## The bic criterion for iteration 166 is -279.110033109741
## The bic criterion for iteration 167 is -279.110033109741
## The bic criterion for iteration 168 is -279.110033109741
## The bic criterion for iteration 169 is -279.110033109741
## The bic criterion for iteration 170 is -279.110033109741
## The bic criterion for iteration 171 is -279.110033109741
## The bic criterion for iteration 172 is -279.110033109741
## The bic criterion for iteration 173 is -279.110033109741
## The bic criterion for iteration 174 is -279.110033109741
## The bic criterion for iteration 175 is -279.110033109741
## The bic criterion for iteration 176 is -279.110033109741
## The bic criterion for iteration 177 is -279.110033109741
## The bic criterion for iteration 178 is -279.110033109741
## The bic criterion for iteration 179 is -279.110033109741
## The bic criterion for iteration 180 is -279.110033109741
## The bic criterion for iteration 181 is -279.110033109741
## The bic criterion for iteration 182 is -279.110033109741
## The bic criterion for iteration 183 is -279.110033109741
## The bic criterion for iteration 184 is -279.110033109741
## The bic criterion for iteration 185 is -279.110033109741
## The bic criterion for iteration 186 is -279.110033109741
## The bic criterion for iteration 187 is -279.110033109741
## The bic criterion for iteration 188 is -279.110033109741
## The bic criterion for iteration 189 is -279.110033109741
## The bic criterion for iteration 190 is -279.110033109741
## The bic criterion for iteration 191 is -279.110033109741
## The bic criterion for iteration 192 is -279.110033109741
## The bic criterion for iteration 193 is -279.110033109741
## The bic criterion for iteration 194 is -279.110033109741
## The bic criterion for iteration 195 is -279.110033109741
## The bic criterion for iteration 196 is -279.110033109741
## The bic criterion for iteration 197 is -279.110033109741
## The bic criterion for iteration 198 is -279.110033109741
## The bic criterion for iteration 199 is -279.110033109741
## The bic criterion for iteration 200 is -279.110033109741
# Trace of the selection criterion across iterations.
plot(unlist(tree@performance$criterionEvolution), type = "l")

# For each row of `newdata`, return the label (column name) with the highest
# predicted probability under the best tree, as a factor.
most_likely_segment <- function(newdata) {
  seg_prob <- predict(tree@best.tree$tree, newdata, type = "prob")
  factor(apply(seg_prob, 1, function(p) names(which.max(p))))
}

data$c_map <- most_likely_segment(data)
test$c_map <- most_likely_segment(test)

# Cross-tabulate column `c` of the simulated data against the assigned segment.
table(data$c, data$c_map)
##    
##      1  9
##   1 34 34
##   2 66  0
##   3  0 66
# Scatter of the training sample coloured by assigned segment. Columns are
# addressed by position: presumably 1 and 2 are x1/x2 and column 3 drives the
# plotting symbol -- TODO confirm against the layout returned by generateData().
plot(
  data[, 1], data[, 2],
  pch = 2 + data[, 3],
  col = as.numeric(data$c_map),
  xlab = "First coordinate",
  ylab = "Second coordinate"
)

plot(tree@best.tree$tree)

# One predicted probability per held-out row. The buffer is sized from `test`
# itself: the previous 0.2 * 1000 rows matched the training sample, not the
# test sample, and only worked because out-of-range assignment silently
# extended the matrix and dropped its dimensions.
pred <- numeric(nrow(test))

# Score each test row with the logistic regression stored for its segment.
# NOTE(review): this pairs glms[[j]] with the j-th factor level of c_map, so
# it assumes the stored models follow the (character-sorted) level order --
# confirm against the glmtree class documentation.
segment_levels <- levels(data$c_map)
for (j in seq_along(segment_levels)) {
  in_segment <- test$c_map == segment_levels[j]
  pred[in_segment] <- predict(
    tree@best.tree$glms[[j]],
    test[in_segment, ],
    type = "response"
  )
}

# Out-of-sample Gini of the glmtree segmentation.
normalizedGini(test$y, pred)
## [1] 0.4509215