3 Simple de novo simulation

Simulation of different simple scenarios and missingness mitigation approaches

Author

Janick Weberpals

Published

November 1, 2024

3.1 Objective

In this section, we will generate 1000 following different missingness scenarios.

# define column numbers for pattern and weight determination
col_Z2 <- which(colnames(mock_cohort) %in% "Z2")
col_U <- which(colnames(mock_cohort) %in% "U")

# define missingness pattern
default_pattern <- rep(1, ncol(mock_cohort))
pattern <- replace(default_pattern, col_Z2, 0)

# define weights 
default_weights <- rep(0, ncol(mock_cohort))
weights <- replace(default_weights, c(col_Z2, col_U), c(0.2, 0.8))

prop <- 0.5

3.2 Continuous distribution probabilities

Probabilities are based on a continuous distribution.

tic(msg = "Simulation based on continuous distribution")

results <- parallel::mclapply(
  X = 1:n_replicates,
  FUN = run_simulation,
  pattern = pattern,
  weights = weights,
  type = "RIGHT",
  prop = prop,
  mc.cores = parallel::detectCores()-1
  ) 
  
results_cont <- do.call(rbind, results) |> 
  mutate(simulation = "Continuous distribution (RIGHT)")

toc()

Simulation based on continuous distribution: 1410.097 sec elapsed

3.3 Discrete distribution probabilities

Probabilities are based on a discrete distribution.

tic(msg = "Simulation based on discrete distribution")

odds <- c(1, 2, 3, 4)

results <- parallel::mclapply(
  X = 1:n_replicates,
  FUN = run_simulation,
  pattern = pattern,
  weights = weights,
  odds = odds,
  prop = prop,
  mc.cores = parallel::detectCores()-1
  ) 
  
results_odds <- do.call(rbind, results) |> 
  mutate(simulation = paste0("Discrete odds (", paste0(odds, collapse = ", "), ")"))

toc()

Simulation based on discrete distribution: 1381.326 sec elapsed

3.4 Save simulation results

results <- rbind(results_cont, results_odds) |> 
  mutate(method = factor(method, levels = c(
    "Unadjusted", 
    "Complete case", 
    "Imputed"))
    )

3.5 Results

The next steps of this script analyze the raw simulation results obtained in the previous script via run_simulation.R. The last run on 2024-11-01 18:20:43.222866.

3.6 Read raw results table

We first look at the results table with the raw simulation results.

results |> 
  glimpse()

Rows: 6,000
Columns: 5
$ method     <fct> Unadjusted, Complete case, Imputed, Unadjusted, Complete ca…
$ estimate   <dbl> 1.2629630, 0.8932111, 0.9111699, 1.1150121, 0.8824297, 1.03…
$ se         <dbl> 0.07923421, 0.13274238, 0.10924959, 0.07479571, 0.11677905,…
$ replicate  <int> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7,…
$ simulation <chr> "Continuous distribution (RIGHT)", "Continuous distribution…

3.7 QC

Let’s do a few quality/sanity checks.

Number of analysis methods

unique(results$method)

[1] Unadjusted    Complete case Imputed      
Levels: Unadjusted Complete case Imputed

Assert that there are no missing results

assert_that(!any(sapply(results$estimate, is.na)), msg = "There are missing estimates")

[1] TRUE

assert_that(!any(sapply(results$se, is.na)), msg = "There are missing standard errors")

[1] TRUE

3.8 Main results

# call helper functions
source(here::here("functions", "rsimsum_ggplot.R"))

For the analysis of aggregate simulation results we use the rsimsum package. More information about this package can be found here.[@rsimsum]

simsum_out <- simsum(
  data = results, 
  estvarname = "estimate", 
  se = "se", 
  true = 1, 
  by = "simulation",
  methodvar = "method", 
  ref = "Complete case"
  ) |> 
  summary() |> 
  tidy()

Hazard ratio distribution
Hazard ratio distribution (interactive)

hr_distribution <- results |> 
  ggplot(aes(x = method, y = estimate)) +
  geom_boxplot(outlier.colour = "red") +
  geom_point(position = position_jitter(seed = 42), alpha = 0.15) +
  geom_hline(yintercept = 1.0, color = "forestgreen", linetype = "dashed") +
  labs(
    x = "Method",
    y = "Hazard ratio (HR)"
    ) +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 35, vjust = 1, hjust=1)
    ) +
  facet_wrap(~simulation)

hr_distribution

Figure 3.1: Hazard ratio distribution by method across all simulated datasets.

ggplotly(hr_distribution)

3.8.1 Root mean squared error (RMSE)

rmse <- rsimsum_ggplot(tidy_simsum = simsum_out, metric = "rmse")

Plot
Table

rmse$plot

Figure 3.2: Root mean squared error by method.

rmse$table |> 
  gt()

Table 3.1: Root mean squared error by method.

method	est	lower	upper
Continuous distribution (RIGHT)
Unadjusted	0.207	0.200	0.213
Complete case	0.187	0.178	0.196
Imputed	0.163	0.156	0.170
Discrete odds (1, 2, 3, 4)
Unadjusted	0.207	0.200	0.213
Complete case	0.201	0.188	0.212
Imputed	0.129	0.121	0.136

3.8.2 Bias

bias <- rsimsum_ggplot(tidy_simsum = simsum_out, metric = "bias")

Plot
Table

bias$plot

bias$table |> 
  gt()

Table 3.2: Bias by method.

method	est	lower	upper
Continuous distribution (RIGHT)
Unadjusted	0.184	0.178	0.189
Complete case	0.103	0.093	0.113
Imputed	0.100	0.092	0.108
Discrete odds (1, 2, 3, 4)
Unadjusted	0.184	0.178	0.189
Complete case	0.102	0.091	0.113
Imputed	0.058	0.051	0.065

3.8.3 Coverage

coverage <- rsimsum_ggplot(tidy_simsum = simsum_out, metric = "coverage")

Plot
Table

coverage$plot

coverage$table |> 
  gt()

Table 3.3: Coverage by method.

method	est	lower	upper
Continuous distribution (RIGHT)
Unadjusted	0.387	0.357	0.417
Complete case	0.819	0.795	0.843
Imputed	0.844	0.822	0.866
Discrete odds (1, 2, 3, 4)
Unadjusted	0.387	0.357	0.417
Complete case	0.842	0.819	0.865
Imputed	0.938	0.923	0.953

3.8.4 Empirical standard error

empse <- rsimsum_ggplot(tidy_simsum = simsum_out, metric = "empse")

Plot
Table

empse$plot

Figure 3.5: Empirical standard error by method.

empse$table |> 
  gt()

Table 3.4: Empirical standard error by method.

method	est	lower	upper
Continuous distribution (RIGHT)
Unadjusted	0.095	0.091	0.099
Complete case	0.157	0.150	0.163
Imputed	0.129	0.123	0.135
Discrete odds (1, 2, 3, 4)
Unadjusted	0.095	0.091	0.099
Complete case	0.173	0.165	0.180
Imputed	0.115	0.110	0.120

3.9 Session info

Total script runtime: 46.57 minutes.

pander::pander(subset(data.frame(sessioninfo::package_info()), attached==TRUE, c(package, loadedversion)))

	package	loadedversion
assertthat	assertthat	0.2.1
dplyr	dplyr	1.1.4
ggplot2	ggplot2	3.4.4
gt	gt	0.10.1
here	here	1.0.1
plotly	plotly	4.10.4
rsimsum	rsimsum	0.11.3
tictoc	tictoc	1.2
tidyr	tidyr	1.3.1

pander::pander(sessionInfo())

R version 4.4.0 (2024-04-24)

Platform: x86_64-pc-linux-gnu

locale: LC_CTYPE=C.UTF-8, LC_NUMERIC=C, LC_TIME=C.UTF-8, LC_COLLATE=C.UTF-8, LC_MONETARY=C.UTF-8, LC_MESSAGES=C.UTF-8, LC_PAPER=C.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=C.UTF-8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats, graphics, grDevices, datasets, utils, methods and base

other attached packages: plotly(v.4.10.4), tidyr(v.1.3.1), assertthat(v.0.2.1), rsimsum(v.0.11.3), gt(v.0.10.1), ggplot2(v.3.4.4), tictoc(v.1.2), dplyr(v.1.1.4) and here(v.1.0.1)

loaded via a namespace (and not attached): sass(v.0.4.8), utf8(v.1.2.4), generics(v.0.1.3), renv(v.1.0.3), xml2(v.1.3.6), digest(v.0.6.33), magrittr(v.2.0.3), evaluate(v.0.23), grid(v.4.4.0), fastmap(v.1.1.1), rprojroot(v.2.0.4), jsonlite(v.1.8.8), sessioninfo(v.1.2.2), backports(v.1.4.1), httr(v.1.4.7), pander(v.0.6.5), purrr(v.1.0.2), fansi(v.1.0.6), crosstalk(v.1.2.1), viridisLite(v.0.4.2), scales(v.1.3.0), lazyeval(v.0.2.2), cli(v.3.6.2), rlang(v.1.1.2), ellipsis(v.0.3.2), munsell(v.0.5.0), withr(v.3.0.0), yaml(v.2.3.8), tools(v.4.4.0), checkmate(v.2.3.1), colorspace(v.2.1-0), vctrs(v.0.6.5), R6(v.2.5.1), ggridges(v.0.5.6), lifecycle(v.1.0.4), htmlwidgets(v.1.6.4), simsurv(v.1.0.0), pkgconfig(v.2.0.3), pillar(v.1.9.0), gtable(v.0.3.4), Rcpp(v.1.0.12), glue(v.1.6.2), data.table(v.1.15.0), xfun(v.0.41), tibble(v.3.2.1), tidyselect(v.1.2.0), knitr(v.1.45), farver(v.2.1.1), htmltools(v.0.5.7), rmarkdown(v.2.28), labeling(v.0.4.3) and compiler(v.4.4.0)

pander::pander(options('repos'))

repos:

REPO_NAME

https://packagemanager.posit.co/cran/latest