knitr::opts_chunk$set(eval = FALSE)
library(MissImp)

In the first section, a complete mixed-type dataset is generated with 5 continuous variables, 1 integer variable and 2 categorical variables. Then, given the desired proportion of missing data and the missing-data mechanism, an incomplete dataset with missing values is created from the complete dataset.

In the second section, several imputation methods are demonstrated one by one; each replaces the missing values of the generated incomplete dataset with predictions. For the categorical variables, both the final imputed category and the predicted probability of each category are presented.

Generate data

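We generate a complete data set \((Y_j)_{1 \leq j \leq 8}\) with \((Y_1, Y_2, Y_3) \sim \mathcal{N}(\mu_1, \Sigma_1)\), \((Y_4, Y_5) \sim \mathcal{N}(\mu_2, \Sigma_2)\), \(Y_6 \sim \text{Poisson}(\lambda)\), \(Y_7 \sim \text{Bin}(5, 0.4)\) and \(Y_8 \sim \text{Bin}(7, 0.6)\). The two binomial variables \(Y_7\) and \(Y_8\) are converted to factors and treated as categorical variables.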

n <- 10000

# Generate a complete mixed-type data frame with `n` rows and 8 columns.
#
# Columns:
#   Y1-Y3  trivariate normal, mean (1, 2, 3)
#   Y4-Y5  bivariate normal, mean (9, 8)
#   Y6     Poisson counts with rate 4.3 (stored as numeric)
#   Y7     Binomial(5, 0.4) draws as a factor; 0..5 are labelled "F".."A"
#   Y8     Binomial(7, 0.6) draws as a factor of the observed values
#
# Args:
#   n: number of rows to generate (a positive integer).
#
# Returns:
#   A data.frame with columns Y1..Y8; Y1-Y6 numeric, Y7 and Y8 factors.
complete_df_generator <- function(n) {
  mu.X <- c(1, 2, 3)
  Sigma.X <- matrix(c(
    9, 3, 2,
    3, 4, 0,
    2, 0, 1
  ), nrow = 3)
  # mvrnorm() drops to a plain vector when n == 1; reshaping keeps one row
  # per observation so that cbind() below always lines the columns up.
  X.complete.cont <- matrix(MASS::mvrnorm(n, mu.X, Sigma.X), nrow = n)

  mu1.X <- c(9, 8)
  Sigma1.X <- matrix(c(
    16, 14,
    14, 25
  ), nrow = 2)
  X.complete.cont1 <- matrix(MASS::mvrnorm(n, mu1.X, Sigma1.X), nrow = n)

  lambda <- 4.3
  X.complete.discr <- stats::rpois(n, lambda) # poisson distribution

  X.complete.cat <- stats::rbinom(n, size = 5, prob = 0.4) # binomial

  X.complete.cat2 <- stats::rbinom(n, size = 7, prob = 0.6) # binomial

  X.complete <- data.frame(cbind(
    X.complete.cont, X.complete.cont1,
    X.complete.discr,
    X.complete.cat, X.complete.cat2
  ))
  colnames(X.complete) <- c("Y1", "Y2", "Y3", "Y4", "Y5", "Y6", "Y7", "Y8")

  # Declare the full support 0..5 explicitly. Relabelling with `levels<-`
  # is positional, so it silently mislabels the data whenever one of the
  # six values happens not to be drawn (likely for small n).
  X.complete$Y7 <- factor(
    X.complete$Y7,
    levels = 0:5,
    labels = c("F", "E", "D", "C", "B", "A")
  )
  X.complete$Y8 <- as.factor(X.complete$Y8)
  X.complete
}

We can generate an incomplete data frame from the complete data frame, with a chosen percentage of missing data and a chosen missing-data mechanism. The difference between the mechanisms is explained in the documentation of generate_miss.

# Simulation settings for the incomplete dataset.
n <- 1000 # number of rows to generate
mech <- "MAR1" # missing-data mechanism passed to generate_miss
miss_prop <- 0.4 # desired proportion of missing data
# Draw the complete data, then derive its incomplete counterpart.
X.complete <- complete_df_generator(n)
rs <- generate_miss(X.complete, miss_prop, mechanism = mech)
# Incomplete data frame used as input by the imputation examples.
df <- rs$X.incomp

Single Imputation

EM

# Single imputation with em(), run on a copy of the incomplete data `df`.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
# NOTE(review): presumably re-encodes the categorical columns before
# imputation; confirm against the factor_ordinal_encode documentation.
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
imp.em <- em(df_with_mv, col_cat = col_cat)
head(imp.em$ximp) # first rows of the imputation result
# NOTE(review): `ximp.disj` presumably holds the per-category predicted
# probabilities; confirm.
head(imp.em$ximp.disj)

Missforest

# Single imputation with missForest(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Pass the encoded copy, as every other method in this document does;
# passing `df` would silently discard the encoding step above.
imp.forest <- missForest(
  xmis = df_with_mv, maxiter = 3, ntree = 100, col_cat = col_cat
)
head(imp.forest$ximp) # first rows of the imputation result
head(imp.forest$ximp.disj)

MissRanger

# Single imputation with missRanger(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Reuse `col_cat` so the encoded columns and the columns declared
# categorical to the imputer cannot drift apart.
imp.ranger <- missRanger(df_with_mv, col_cat = col_cat, maxiter = 3)

head(imp.ranger$ximp) # first rows of the imputation result
head(imp.ranger$ximp.disj)

MissMDA

# Single imputation with imputeFAMD(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
# NOTE(review): presumably re-encodes the categorical columns before
# imputation; confirm against the factor_ordinal_encode documentation.
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# ncp = 3 and maxiter = 30 are passed through to imputeFAMD unchanged.
imp.pca <- imputeFAMD(df_with_mv, ncp = 3, maxiter = 30)
head(imp.pca$completeObs) # first rows of the completed dataset
# NOTE(review): `tab.disj` presumably holds the per-category values for
# the categorical columns; confirm.
head(imp.pca$tab.disj)

KNN

# Single imputation with kNN(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Reuse `col_cat` so the encoded columns and the columns declared
# categorical to the imputer cannot drift apart.
imp.knn <- kNN(df_with_mv, col_cat = col_cat)
head(imp.knn$ximp) # first rows of the imputation result
head(imp.knn$ximp.disj)

Multiple Imputation

MI EM Amelia

# Multiple imputation with MI_EM_amelia(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
# NOTE(review): presumably re-encodes the categorical columns before
# imputation; confirm against the factor_ordinal_encode documentation.
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Columns 1-6 are declared numeric, 7-8 categorical; num_imp = 3 is
# presumably the number of imputed datasets (confirm).
imp.em.mi <- MI_EM_amelia(df_with_mv, col_num = c(1:6), col_cat = col_cat, num_imp = 3)
head(imp.em.mi$ximp) # first rows of the imputation result
head(imp.em.mi$ximp.disj)

MI mice

# Multiple imputation with mice, run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
res <- mice::mice(df_with_mv, m = 3) # m = 3 imputed datasets
# The second argument repeats the `m` used above; `col_cat` is reused so the
# categorical columns are declared in one place only.
imp.mice <- result_mice(res, 3, col_cat = col_cat)

head(imp.mice$ximp) # first rows of the imputation result
head(imp.mice$ximp.disj)

MI PCA

# Multiple imputation with MIFAMD(), run on a copy of the incomplete data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
# NOTE(review): presumably re-encodes the categorical columns before
# imputation; confirm against the factor_ordinal_encode documentation.
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# ncp = 3, maxiter = 50 and nboot = 3 are passed through to MIFAMD unchanged.
imp.pca.mi <- MIFAMD(df_with_mv, ncp = 3, maxiter = 50, nboot = 3)
head(imp.pca.mi$ximp) # first rows of the imputation result
# NOTE(review): `ximp.disj` presumably holds the per-category predicted
# probabilities; confirm.
head(imp.pca.mi$ximp.disj)

MI MissRanger

# Multiple imputation with MI_missRanger(), run on a copy of the incomplete
# data.
df_with_mv <- df
col_cat <- c(7, 8) # positions of the categorical columns
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Reuse `col_cat` so the encoded columns and the columns declared
# categorical to the imputer cannot drift apart.
imp.ranger.mi <- MI_missRanger(df_with_mv, col_cat = col_cat, num_mi = 3)

head(imp.ranger.mi$ximp) # first rows of the imputation result
head(imp.ranger.mi$ximp.disj)