C_Each_Imputation_Method.Rmd
knitr::opts_chunk$set(eval = FALSE)
library(MissImp)
In the first section, a complete mixed-type dataset is generated with 5 continuous variables, 1 integer variable and 2 categorical variables. Then, given the desired proportion of missing data and the missing-data mechanism, an incomplete dataset with missing values is created from the complete dataset.
In the second section, several imputation methods are applied separately to the generated incomplete dataset, each replacing the missing values with predictions. For the categorical variables, both the final imputed category and the predicted probability of each category are presented.
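We generate a complete data set \((Y_j)_{1 \leq j \leq 8}\) with \((Y_1, Y_2, Y_3) \sim \mathcal{N}(\mu_1, \Sigma_1)\), \((Y_4, Y_5) \sim \mathcal{N}(\mu_2, \Sigma_2)\), \(Y_6 \sim \text{Poisson}(\lambda)\), \(Y_7 \sim \text{Bin}(5, 0.4)\) and \(Y_8 \sim \text{Bin}(7, 0.6)\). The binomial counts \(Y_7\) and \(Y_8\) are then converted to categorical variables.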
# Sample size for the simulated data.
# NOTE(review): `n` is reassigned to 1000 further down, before
# `complete_df_generator()` is called, so this value is not used by the
# code visible in this file.
n <- 10000
#' Generate a complete mixed-type data frame
#'
#' Draws `n` independent rows with eight columns:
#' * `Y1`-`Y3`: trivariate normal with mean (1, 2, 3);
#' * `Y4`-`Y5`: bivariate normal with mean (9, 8);
#' * `Y6`: Poisson counts with rate 4.3;
#' * `Y7`: factor built from Binomial(5, 0.4) counts, where a count of 0 is
#'   labelled "F" and a count of 5 is labelled "A";
#' * `Y8`: factor built from Binomial(7, 0.6) counts, whose levels are the
#'   counts actually observed.
#'
#' @param n Number of rows to generate (a positive integer).
#' @return A data frame with `n` rows and columns `Y1`, ..., `Y8`.
complete_df_generator <- function(n) {
  # First continuous group: (Y1, Y2, Y3).
  mu.X <- c(1, 2, 3)
  Sigma.X <- matrix(c(
    9, 3, 2,
    3, 4, 0,
    2, 0, 1
  ), nrow = 3)
  # mvrnorm() returns a plain vector when n == 1; force an n-row matrix so
  # that cbind() below always binds columns of n observations.
  X.complete.cont <- matrix(MASS::mvrnorm(n, mu.X, Sigma.X), nrow = n)

  # Second continuous group: (Y4, Y5).
  mu1.X <- c(9, 8)
  Sigma1.X <- matrix(c(
    16, 14,
    14, 25
  ), nrow = 2)
  X.complete.cont1 <- matrix(MASS::mvrnorm(n, mu1.X, Sigma1.X), nrow = n)

  # Integer-valued variable Y6.
  lambda <- 4.3
  X.complete.discr <- stats::rpois(n, lambda)

  # Counts that are turned into the categorical variables Y7 and Y8.
  X.complete.cat <- stats::rbinom(n, size = 5, prob = 0.4)
  X.complete.cat2 <- stats::rbinom(n, size = 7, prob = 0.6)

  X.complete <- data.frame(cbind(
    X.complete.cont, X.complete.cont1,
    X.complete.discr,
    X.complete.cat, X.complete.cat2
  ))

  # Map each count to a fixed label (0 -> "F", ..., 5 -> "A"). Declaring the
  # levels explicitly keeps the mapping correct even when some count does not
  # occur in the sample; relabelling the observed levels by position would
  # silently shift the labels in that case.
  X.complete[, 7] <- factor(
    X.complete[, 7],
    levels = 0:5,
    labels = c("F", "E", "D", "C", "B", "A")
  )
  X.complete[, 8] <- as.factor(X.complete[, 8])

  colnames(X.complete) <- c("Y1", "Y2", "Y3", "Y4", "Y5", "Y6", "Y7", "Y8")
  X.complete
}
We can generate an incomplete data frame from the complete data frame, with a chosen proportion of missing data and a chosen missing-data mechanism. The differences between the mechanisms are explained in the documentation of `generate_miss`.
# Simulation settings: sample size, missing-data mechanism and proportion of
# missing values (the mechanisms are described in the documentation of
# `generate_miss()`).
n <- 1000
mech <- "MAR1"
miss_prop <- 0.4
# Draw the complete data, then derive the incomplete version from it.
X.complete <- complete_df_generator(n)
rs <- generate_miss(X.complete, miss_prop, mechanism = mech)
# Incomplete data frame used as input by the imputation examples that follow.
df <- rs$X.incomp
# Imputation with `em()`: encode the categorical columns (7 and 8) with
# `factor_ordinal_encode()`, then impute the encoded data.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.em <- em(df_with_mv, col_cat = col_cat)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the `em()` docs).
head(imp.em$ximp)
head(imp.em$ximp.disj)
# Imputation with `missForest()`: encode the categorical columns (7 and 8)
# with `factor_ordinal_encode()`, then impute the encoded data.
df_with_mv <- df
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df_with_mv, col_cat)
# Pass the encoded data frame, as every other method in this file does; the
# raw `df` was passed before, which left the encoding step above unused.
imp.forest <- missForest(xmis = df_with_mv, maxiter = 3, ntree = 100, col_cat = col_cat)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.forest$ximp)
head(imp.forest$ximp.disj)
# Imputation with `missRanger()`: encode the categorical columns (7 and 8)
# with `factor_ordinal_encode()`, then impute the encoded data.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.ranger <- missRanger(df_with_mv, col_cat = 7:8, maxiter = 3)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.ranger$ximp)
head(imp.ranger$ximp.disj)
# Imputation with `imputeFAMD()`: encode the categorical columns (7 and 8)
# with `factor_ordinal_encode()`, then impute with 3 components and at most
# 30 iterations.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.pca <- imputeFAMD(df_with_mv, ncp = 3, maxiter = 30)
# Preview the completed observations and the `tab.disj` component.
head(imp.pca$completeObs)
head(imp.pca$tab.disj)
# Imputation with `MI_EM_amelia()` using 3 imputations: columns 1 to 6 are
# declared numerical, columns 7 and 8 categorical.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.em.mi <- MI_EM_amelia(df_with_mv, col_num = 1:6, col_cat = col_cat, num_imp = 3)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.em.mi$ximp)
head(imp.em.mi$ximp.disj)
# Imputation with `mice::mice()` using 3 imputations; `result_mice()` then
# turns the mice output into the `ximp` / `ximp.disj` result used in this file.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
res <- mice::mice(df_with_mv, m = 3)
imp.mice <- result_mice(res, 3, col_cat = col_cat)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.mice$ximp)
head(imp.mice$ximp.disj)
# Imputation with `MIFAMD()`: encode the categorical columns (7 and 8) with
# `factor_ordinal_encode()`, then impute with 3 components, at most 50
# iterations and `nboot = 3`.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.pca.mi <- MIFAMD(df_with_mv, ncp = 3, maxiter = 50, nboot = 3)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.pca.mi$ximp)
head(imp.pca.mi$ximp.disj)
# Imputation with `MI_missRanger()` using `num_mi = 3`: encode the categorical
# columns (7 and 8) with `factor_ordinal_encode()`, then impute.
col_cat <- c(7, 8)
df_with_mv <- factor_ordinal_encode(df, col_cat)
imp.ranger.mi <- MI_missRanger(df_with_mv, col_cat = col_cat, num_mi = 3)
# Preview the imputed data and the `ximp.disj` component
# (presumably the per-category probabilities -- confirm in the package docs).
head(imp.ranger.mi$ximp)
head(imp.ranger.mi$ximp.disj)