| 1 |
#' Calculate Standardized Mean Difference (SMD) Between RCT and RWE Results |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function calculates the standardized mean difference (SMD) between results from |
|
| 5 |
#' a randomized controlled trial (RCT) and its real-world evidence (RWE) emulation. |
|
| 6 |
#' The SMD provides a measure of agreement between the two results, taking into account |
|
| 7 |
#' both the point estimates and their corresponding uncertainty. |
|
| 8 |
#' |
|
| 9 |
#' @details |
|
| 10 |
#' The SMD is calculated as the difference between RCT and RWE estimates, divided by |
|
| 11 |
#' the square root of the sum of their variances. Variances are derived from the |
|
| 12 |
#' confidence intervals assuming normal distribution (using 1.96 for 95% CI). |
|
| 13 |
#' |
|
| 14 |
#' The formula used is: |
|
| 15 |
#' SMD = (θ_RCT - θ_RWE) / sqrt(Var(θ_RCT) + Var(θ_RWE)) |
|
| 16 |
#' |
|
| 17 |
#' @param rct_estimate Numeric. Point estimate from the RCT (typically log hazard ratio or log odds ratio) |
|
| 18 |
#' @param rct_lower Numeric. Lower bound of the confidence interval from RCT |
|
| 19 |
#' @param rct_upper Numeric. Upper bound of the confidence interval from RCT |
|
| 20 |
#' @param rwe_estimate Numeric. Point estimate from the RWE study |
|
| 21 |
#' @param rwe_lower Numeric. Lower bound of the confidence interval from RWE |
|
| 22 |
#' @param rwe_upper Numeric. Upper bound of the confidence interval from RWE |
|
| 23 |
#' |
|
| 24 |
#' @return |
|
| 25 |
#' Returns a numeric value representing the standardized mean difference. |
|
| 26 |
#' Larger absolute values indicate greater disagreement between RCT and RWE results. |
|
| 27 |
#' |
|
| 28 |
#' @examples |
|
| 29 |
#' # Example with hazard ratios (log scale) |
|
| 30 |
#' smd <- smd_agreement( |
|
| 31 |
#' rct_estimate = log(0.87), |
|
| 32 |
#' rct_lower = log(0.78), |
|
| 33 |
#' rct_upper = log(0.97), |
|
| 34 |
#' rwe_estimate = log(0.82), |
|
| 35 |
#' rwe_lower = log(0.76), |
|
| 36 |
#' rwe_upper = log(0.87) |
|
| 37 |
#' ) |
|
| 38 |
#' |
|
| 39 |
#' @references |
|
| 40 |
#' Based on methodology for comparing RCT and RWE results using standardized differences.
|
| 41 |
#' |
|
| 42 |
#' @export |
|
| 43 |
smd_agreement <- function(rct_estimate,
                          rct_lower,
                          rct_upper,
                          rwe_estimate,
                          rwe_lower,
                          rwe_upper
                          ){

  # Input validation
  required_args <- c("rct_estimate", "rct_lower", "rct_upper",
                     "rwe_estimate", "rwe_lower", "rwe_upper")

  # Check for explicitly NULL values (vapply is type-stable, unlike sapply).
  # Note: arguments that are simply not supplied error here when the list is
  # built ("argument ... is missing"), before this check can report them.
  missing_args <- required_args[vapply(list(rct_estimate, rct_lower, rct_upper,
                                            rwe_estimate, rwe_lower, rwe_upper),
                                       is.null, logical(1))]
  if (length(missing_args) > 0) {
    stop(sprintf("Missing required arguments: %s",
                 paste(missing_args, collapse = ", ")))
  }

  # Check numeric inputs
  if (!all(vapply(list(rct_estimate, rct_lower, rct_upper,
                       rwe_estimate, rwe_lower, rwe_upper),
                  is.numeric, logical(1)))) {
    stop("All inputs must be numeric values")
  }

  # Check bounds: each CI must be a proper interval
  if (rct_lower >= rct_upper) {
    stop("RCT lower bound must be less than upper bound")
  }
  if (rwe_lower >= rwe_upper) {
    stop("RWE lower bound must be less than upper bound")
  }

  # Warn (not error) if an estimate falls outside its own CI
  if (rct_estimate < rct_lower || rct_estimate > rct_upper) {
    warning("RCT estimate is outside its confidence interval bounds")
  }
  if (rwe_estimate < rwe_lower || rwe_estimate > rwe_upper) {
    warning("RWE estimate is outside its confidence interval bounds")
  }

  # Numerator: difference between the two point estimates
  numerator <- (rct_estimate - rwe_estimate)

  # Standard errors recovered from the 95% CI width assuming normality:
  # SE = (upper - lower) / (2 * 1.96). (Previously misnamed "variances";
  # the variance is SE^2, which is what enters the denominator below.)
  se_rct <- (rct_upper - rct_lower)/(2*1.96)
  se_rwe <- (rwe_upper - rwe_lower)/(2*1.96)

  # Denominator: sqrt of the summed variances, i.e. sqrt(SE_rct^2 + SE_rwe^2)
  denominator <- sqrt(se_rct^2 + se_rwe^2)

  # Final check for valid calculation (e.g. infinite CI bounds)
  if (is.infinite(denominator) || is.nan(denominator)) {
    stop("Invalid calculation: denominator is infinite or NaN")
  }

  # SMD = (theta_RCT - theta_RWE) / sqrt(Var_RCT + Var_RWE)
  smd <- numerator/denominator

  return(smd)

}
| 1 |
#' Simulates an artificial EHR-derived analysis-ready oncology dataset
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Parameterized function to quickly create an |
|
| 5 |
#' EHR-derived analytic cohort for analytic code development. |
|
| 6 |
#' |
|
| 7 |
#' @details |
|
| 8 |
#' This function simulates a cohort of patients with |
|
| 9 |
#' oncology data. The cohort is simulated using a Weibull distribution for time to event |
|
| 10 |
#' and a logistic distribution for treatment assignment. |
|
| 11 |
#' The function also allows for missingness to be imposed on the data. |
|
| 12 |
#' The function is parameterized to allow for the number of patients |
|
| 13 |
#' to be simulated, the seed for reproducibility, and whether to |
|
| 14 |
#' include a patient id variable. |
|
| 15 |
#' |
|
| 16 |
#' @param n_total integer, number of total patients |
|
| 17 |
#' @param seed integer, seed for reproducibility |
|
| 18 |
#' @param include_id logical, include a generated patientid variable |
|
| 19 |
#' @param imposeNA logical, set covariates to missing |
|
| 20 |
#' @param propNA numeric, proportion of missingness, needs to be between 0 and 1 |
|
| 21 |
#' |
|
| 22 |
#' @return data frame with simulated analytic cohort |
|
| 23 |
#' |
|
| 24 |
#' @export |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' library(encore.analytics) |
|
| 29 |
#' |
|
| 30 |
#' data_miss <- simulate_data( |
|
| 31 |
#' n_total = 3500, |
|
| 32 |
#' seed = 41, |
|
| 33 |
#' include_id = FALSE, |
|
| 34 |
#' imposeNA = TRUE, |
|
| 35 |
#' propNA = .33 |
|
| 36 |
#' ) |
|
| 37 |
#' |
|
| 38 |
#' head(data_miss) |
|
| 39 |
#' |
|
| 40 |
#'} |
|
| 41 | ||
| 42 |
simulate_data <- function(n_total = 3500,
                          seed = 42,
                          include_id = TRUE,
                          imposeNA = TRUE,
                          propNA = NULL
                          ){

  # check input
  # is.numeric() accepts both integer and double, so simulate_data(3500L) works
  # (inherits(x, "numeric") rejected integer vectors despite the message).
  assertthat::assert_that(is.numeric(n_total), msg = "<n_total> needs to be an integer")
  assertthat::assert_that(n_total > 10, msg = "<n_total> needs to be larger than 10 for meaningful output")
  assertthat::assert_that(is.numeric(seed), msg = "<seed> needs to be an integer")
  assertthat::assert_that(inherits(include_id, "logical"), msg = "<include_id> needs to be a logical")
  assertthat::assert_that(inherits(imposeNA, "logical"), msg = "<imposeNA> needs to be a logical")
  if(!is.null(propNA)) assertthat::assert_that(inherits(propNA, "numeric"), msg = "<propNA> needs to be numeric")
  if(!is.null(propNA)) assertthat::assert_that(propNA >= 0 & propNA <= 1, msg = "<propNA> needs to be a numeric between 0 and 1")
  if(!is.null(propNA)) assertthat::assert_that(imposeNA == TRUE, msg = "<propNA> is specified but imposeNA is FALSE")
  # fail early with a clear message instead of crashing inside mice::ampute()
  if(imposeNA) assertthat::assert_that(!is.null(propNA), msg = "<imposeNA> is TRUE but <propNA> is not specified")

  # set the seed for reproducibility of all draws below
  set.seed(seed)

  # Build the total cohort (one row per patient; column order matters for
  # reproducibility because each column consumes RNG draws in sequence)
  cohort <- tibble::tibble(
    # patient id
    patientid = paste0("Patient", seq_len(n_total)),
    # age at index date
    dem_age_index_cont = stats::rnorm(n = n_total, mean = 69, sd = 8),
    # sex/male indicator
    dem_sex_cont = stats::rbinom(n = n_total, size = 1, prob = .33),
    # smoking history
    c_smoking_history = sample(c(TRUE, FALSE), size = n_total, replace = TRUE, prob = c(.44, .56)),
    # number of metastatic sites
    c_number_met_sites = sample(c(1, 2, 3, 4, 5), size = n_total, replace = TRUE, prob = c(.73, .22, .035, .006, .001)),
    # ECOG performance status indicator
    c_ecog_cont = stats::rbinom(n = n_total, size = 1, prob = .57),
    # stage at initial diagnosis
    c_stage_initial_dx_cont = sample(c(4, 3, 2, 1), size = n_total, replace = TRUE, prob = c(.76, 0.2, 0.028, 0.012)),
    # race
    dem_race = factor(sample(c("Asian", "White", "Other"), size = n_total, replace = TRUE, prob = c(.36, .62, .02))),
    # US region
    dem_region = factor(sample(c("Midwest", "Northeast", "South", "West"), size = n_total, replace = TRUE, prob = c(.14, .20, .39, .27))),
    # socio-economic status quintile
    dem_ses = sample(c(1, 2, 3, 4, 5), size = n_total, replace = TRUE, prob = c(.12, .15, .19, .24, .29)),
    # labs and vitals (several on log scale, e.g. bilirubin/monocytes)
    c_hemoglobin_g_dl_cont = stats::rnorm(n = n_total, mean = 12.90, sd = 1.3),
    c_urea_nitrogen_mg_dl_cont = stats::rnorm(n = n_total, mean = 2.71, sd = 0.3),
    c_platelets_10_9_l_cont = stats::rnorm(n = n_total, mean = 259, sd = 53),
    c_calcium_mg_dl_cont = stats::rnorm(n = n_total, mean = 2.23, sd = 0.04),
    c_glucose_mg_dl_cont = stats::rnorm(n = n_total, mean = 4.64, sd = 0.11),
    c_lymphocyte_leukocyte_ratio_cont = stats::rnorm(n = n_total, mean = 2.87, sd = 0.17),
    c_alp_u_l_cont = stats::rnorm(n = n_total, mean = 4.48, sd = 0.13),
    c_protein_g_l_cont = stats::rnorm(n = n_total, mean = 68, sd = 4),
    c_alt_u_l_cont = stats::rnorm(n = n_total, mean = 2.89, sd = 0.33),
    c_albumin_g_l_cont = stats::rnorm(n = n_total, mean = 40, sd = 3),
    c_bilirubin_mg_dl_cont = stats::rnorm(n = n_total, mean = -0.92, sd = 1.28),
    c_chloride_mmol_l_cont = stats::rnorm(n = n_total, mean = 102.0, sd = 3),
    c_monocytes_10_9_l_cont = stats::rnorm(n = n_total, mean = -0.51, sd = 0.25),
    c_eosinophils_leukocytes_ratio_cont = stats::rnorm(n = n_total, mean = 0.69, sd = 0.59),
    c_ldh_u_l_cont = stats::rnorm(n = n_total, mean = 1.69, sd = 0.05),
    c_hr_cont = stats::rnorm(n = n_total, mean = 4.43, sd = 0.04),
    c_sbp_cont = stats::rnorm(n = n_total, mean = 4.85, sd = 0.09),
    c_oxygen_cont = stats::rnorm(n = n_total, mean = 97.00, sd = 0.01),
    c_neutrophil_lymphocyte_ratio_cont = stats::rnorm(n = n_total, mean = 1.29, sd = 0.38),
    c_bmi_cont = stats::rnorm(n = n_total, mean = 3.23, sd = 0.13),
    c_ast_alt_ratio_cont = stats::rnorm(n = n_total, mean = 0.12, sd = 0.29),
    # time from diagnosis to index date
    c_time_dx_to_index = stats::rnorm(n = n_total, mean = 44, sd = 17),
    # de-novo metastatic disease at diagnosis
    c_de_novo_mets_dx = sample(c(TRUE, FALSE), size = n_total, replace = TRUE, prob = c(.79, .21)),
    c_height_cont = stats::rnorm(n = n_total, mean = 1.65, sd = 0.08),
    c_weight_cont = stats::rnorm(n = n_total, mean = 69, sd = 11),
    c_dbp_cont = stats::rnorm(n = n_total, mean = 76, sd = 6),
    # index year category
    c_year_index = factor(sample(c("<2018", "2018+"), size = n_total, replace = TRUE, prob = c(.95, .05)))
  )

  # assign treatment via a logistic model on the covariates
  cohort <- cohort |>
    dplyr::mutate(
      # treat (to simplify, categorical variables don't get betas);
      # coefficients are odds ratios entered on the log scale
      treat = stats::rbinom(
        n = n_total,
        size = 1,
        prob = locfit::expit(
          -5.8 +
            dem_age_index_cont * log(1.0071) +
            dem_sex_cont * log(0.9804) +
            c_smoking_history * log(0.7065) +
            c_number_met_sites * log(0.9982) +
            c_hemoglobin_g_dl_cont * log(1.0483) +
            c_urea_nitrogen_mg_dl_cont * log(0.9198) +
            c_platelets_10_9_l_cont * log(0.9998) +
            c_calcium_mg_dl_cont * log(0.9118) +
            c_glucose_mg_dl_cont * log(1.0954) +
            c_lymphocyte_leukocyte_ratio_cont * log(1.1192) +
            c_alp_u_l_cont * log(1.3126) +
            c_protein_g_l_cont * log(1.0002) +
            c_alt_u_l_cont * log(0.9679) +
            c_albumin_g_l_cont * log(1.0296) +
            c_bilirubin_mg_dl_cont * log(0.9126) +
            c_chloride_mmol_l_cont * log(1.0076) +
            c_monocytes_10_9_l_cont * log(1.3919) +
            c_eosinophils_leukocytes_ratio_cont * log(0.9567) +
            c_ldh_u_l_cont * log(0.8804) +
            c_hr_cont * log(1.2264) +
            c_sbp_cont * log(1.0647) +
            c_oxygen_cont * log(1.00064) +
            c_ecog_cont * log(0.7857) +
            c_neutrophil_lymphocyte_ratio_cont * log(1.0758) +
            c_bmi_cont * log(1.061) +
            c_ast_alt_ratio_cont * log(1.0756) +
            c_stage_initial_dx_cont * log(1.2177) +
            c_time_dx_to_index * log(1.0001)
        )
      )
    )

  # simulate heterogeneous treatment effects (HTE) by race:
  # Asian patients get a null treatment effect, all others a protective one

  ## Asian patients
  cohort_Asian <- cohort |>
    dplyr::filter(dem_race == "Asian")

  # betas (log hazard ratios) for the Weibull survival model
  betas_os_Asian <- c(
    treat = log(1),
    dem_sex_cont = log(0.79),
    dem_age_index_cont = log(1.02),
    c_smoking_history = log(.70),
    c_ecog_cont = log(.7)
  )

  set.seed(seed)
  cohort_outcome_Asian <- cohort_Asian |>
    dplyr::bind_cols(
      simsurv::simsurv(
        dist = "weibull",
        gammas = 1.2, # shape parameter
        lambdas = 0.01, # scale parameter
        betas = betas_os_Asian, # betas for the model
        x = cohort_Asian, # data frame with covariates
        maxt = 120 # administrative censoring at 120 months
      )
    ) |>
    dplyr::select(-id) |>
    dplyr::rename(
      fu_itt_months = eventtime,
      death_itt = status
    )

  ## Non-Asian patients (treatment hazard ratio 0.54)
  cohort_nonAsian <- cohort |>
    dplyr::filter(dem_race != "Asian")

  betas_os_nonAsian <- c(
    treat = log(0.54),
    dem_sex_cont = log(0.79),
    dem_age_index_cont = log(1.02),
    c_smoking_history = log(.70),
    c_ecog_cont = log(.7)
  )

  set.seed(seed)
  cohort_outcome_nonAsian <- cohort_nonAsian |>
    dplyr::bind_cols(
      simsurv::simsurv(
        dist = "weibull",
        gammas = 1.2,
        lambdas = 0.01,
        betas = betas_os_nonAsian,
        x = cohort_nonAsian,
        maxt = 120
      )
    ) |>
    dplyr::select(-id) |>
    dplyr::rename(
      fu_itt_months = eventtime,
      death_itt = status
    )

  # combine Asian and non-Asian strata back into one cohort
  cohort_outcome <- rbind(cohort_outcome_Asian, cohort_outcome_nonAsian)

  # optionally drop the generated patient id
  if(!include_id) cohort_outcome <- cohort_outcome |> dplyr::select(-patientid)

  if(imposeNA){

    # variables that must stay complete (ids, treatment, outcome, key demographics)
    varComplete <- c("patientid", "treat", "c_year_index", "fu_itt_months", "death_itt", "dem_age_index_cont", "dem_sex_cont")
    varNA <- setdiff(names(cohort_outcome), varComplete)

    # column positions eligible for missingness
    colsNA <- which(colnames(cohort_outcome) %in% varNA)

    # missingness pattern: 1 = observed, 0 = can be set missing
    default_pattern <- rep(1, ncol(cohort_outcome))
    pattern <- replace(default_pattern, colsNA, 0)

    # MCAR amputation with the requested proportion of incomplete rows
    set.seed(seed)
    cohort_outcome <- mice::ampute(
      data = cohort_outcome,
      prop = propNA,
      patterns = pattern,
      mech = "MCAR"
    )$amp

  }

  return(cohort_outcome)

}
|
| 280 |
| 1 |
#' Estimate raking weights for a mimids or wimids object |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Function estimates raking weights for multiple imputed and matched (\code{mimids})
|
|
| 5 |
#' or weighted (\code{wimids}) datasets. That is to match the distributions of
|
|
| 6 |
#' certain variables in the imputed and matched/weighted datasets to a distribution of |
|
| 7 |
#' a target population (e.g., clinical trial population). |
|
| 8 |
#' |
|
| 9 |
#' @details |
|
| 10 |
#' The function requires an object of class mimids or wimids (\code{x}), which is the output
|
|
| 11 |
#' of a workflow that requires imputing multiple (m) datasets using mice or amelia |
|
| 12 |
#' and matching or weighting each imputed dataset via the MatchThem package |
|
| 13 |
#' (see examples). |
|
| 14 |
#' |
|
| 15 |
#' The function additionally requires a list of target distributions (\code{targets}) for each variable
|
|
| 16 |
#' that is considered for the raking procedure. The list should contain named vectors with the target |
|
| 17 |
#' distributions for each variable and the names of the vectors should match the variable names in the imputed datasets. |
|
| 18 |
#' |
|
| 19 |
#' In brief, the raking procedure iteratively adjusts the weights to make the weighted sample percentages match the target |
|
| 20 |
#' population percentages for the selected variables. It does this by multiplying the current weight for each case by a
|
| 21 |
#' factor based on the ratio of the target population proportion to the weighted sample proportion for a given category. |
|
| 22 |
#' This adjustment is performed sequentially for each category of each selected variable. Because adjusting for one |
|
| 23 |
#' variable can disrupt the match for previous variables, the process is repeated through all selected variables in cycles. |
|
| 24 |
#' This iterative process minimizes the Kullback-Leibler (KL) divergence and continues until the weighted sample proportions |
|
| 25 |
#' match the target population proportions for all categories ("full convergence"), or until no further change occurs.
|
|
| 26 |
#' |
|
| 27 |
#' The function follows the following logic: |
|
| 28 |
#' 1. Extract the ith imputed dataset from the mimids or wimids object |
|
| 29 |
#' 2. Create a temporary case/patient ID |
|
| 30 |
#' 3. Apply the \code{\link[anesrake]{anesrake}} function to the ith imputed dataset
|
|
| 31 |
#' 4. Create a temporary dataframe with the case ID and replace the initial weights with the updated raking weights |
|
| 32 |
#' 5. Merge the temporary dataframe with the ith imputed dataset |
|
| 33 |
#' 6. Drop the temporary case ID |
|
| 34 |
#' 7. Return the ith imputed dataset with the raking weights |
|
| 35 |
#' |
|
| 36 |
#' The function returns a list of data frames with the updated raking weights. These |
|
| 37 |
#' updated raking weights overwrite in each data frame the existing \code{weights} column.
|
|
| 38 |
#' This column can then be used in a downstream analysis (e.g., Kaplan-Meier, Cox proportional hazards regression). |
|
| 39 |
#' |
|
| 40 |
#' @seealso |
|
| 41 |
#' \code{\link[anesrake]{anesrake}} \code{\link[MatchThem]{matchthem}} \code{\link[MatchThem]{weightthem}}
|
|
| 42 |
#' |
|
| 43 |
#' @param x imputed and matched (mimids) or weighted (wimids) object |
|
| 44 |
#' @param targets list of all target values for the raking procedure |
|
| 45 |
#' |
|
| 46 |
#' @return list of data frames with updated raking weights (\code{raking_weights})
|
|
| 47 |
#' |
|
| 48 |
#' @md |
|
| 49 |
#' |
|
| 50 |
#' @export |
|
| 51 |
#' |
|
| 52 |
#' @examples |
|
| 53 |
#' |
|
| 54 |
#' library(encore.analytics) |
|
| 55 |
#' library(mice) |
|
| 56 |
#' library(dplyr) |
|
| 57 |
#' library(MatchThem) |
|
| 58 |
#' library(survival) |
|
| 59 |
#' |
|
| 60 |
#' # simulate a cohort with 500 patients with 20% missing data
|
| 61 |
#' data <- simulate_data( |
|
| 62 |
#'   n_total = 500,
|
| 63 |
#' imposeNA = TRUE, |
|
| 64 |
#' propNA = 0.2 |
|
| 65 |
#' ) |> |
|
| 66 |
#' # anesrake works best with factor variables |
|
| 67 |
#' mutate(c_smoking_history = factor(ifelse(c_smoking_history == TRUE, "Current/former", "Never"))) |
|
| 68 |
#' |
|
| 69 |
#' # impute the data (create mids object) |
|
| 70 |
#' set.seed(42) |
|
| 71 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 72 |
#' |
|
| 73 |
#' # define covariates for propensity score model |
|
| 74 |
#' covariates <- data |> |
|
| 75 |
#' select(starts_with("c_"), starts_with("dem_")) |>
|
|
| 76 |
#' colnames() |
|
| 77 |
#' |
|
| 78 |
#' # define propensity score model |
|
| 79 |
#' fit <- as.formula(paste("treat ~", paste(covariates, collapse = " + ")))
|
|
| 80 |
#' |
|
| 81 |
#' # match patients within each imputed dataset |
|
| 82 |
#' mimids <- matchthem( |
|
| 83 |
#' formula = fit, |
|
| 84 |
#' datasets = mids, |
|
| 85 |
#' approach = 'within', |
|
| 86 |
#' method = 'nearest' |
|
| 87 |
#' ) |
|
| 88 |
#' |
|
| 89 |
#' smoker_target <- c(.35, .65) |
|
| 90 |
#' names(smoker_target) <- c("Current/former", "Never")
|
|
| 91 |
#' |
|
| 92 |
#' # summarize target distributions in a named list vector |
|
| 93 |
#' targets <- list(smoker_target) |
|
| 94 |
#' names(targets) <- c("c_smoking_history")
|
|
| 95 |
#' |
|
| 96 |
#' # estimate raking weights |
|
| 97 |
#' mirwds <- raking_weights( |
|
| 98 |
#' x = mimids, |
|
| 99 |
#' targets = targets |
|
| 100 |
#' ) |
|
| 101 |
#' |
|
| 102 |
raking_weights <- function(x,
                           targets = NULL
                           ){

  # input checks----

  # check if object is a mimids or wimids object
  assertthat::assert_that(inherits(x, c("mimids", "wimids")), msg = "<object> needs to be a mimids or wimids object")
  # check if targets is a list
  assertthat::assert_that(is.list(targets), msg = "<targets> needs to be a list")
  # check if targets is a named list
  # (names() can be NULL, in which case all(NULL != "") is vacuously TRUE,
  # so the NULL case must be rejected explicitly)
  assertthat::assert_that(!is.null(names(targets)) && all(names(targets) != ""), msg = "<targets> needs to be a named list")
  # check if targets is a list of numeric vectors (vapply is type-stable)
  assertthat::assert_that(all(vapply(targets, is.numeric, logical(1))), msg = "<targets> needs to be a list of named vectors")
  # check that every target vector is itself fully named
  assertthat::assert_that(all(vapply(targets, function(t) !is.null(names(t)) && all(names(t) != ""), logical(1))), msg = "<targets> needs to be a list of named vectors")

  # extract imputed and matched/weighted datasets and weights ----

  # worker: rake the i-th imputed dataset, keeping only matched/weighted cases
  estimate_raking_weights <- function(i){

    # all = FALSE : do NOT include observations with a zero estimated weight
    # include = FALSE : do NOT include original data with the missing values

    # Important: the all = parameter needs to be set to FALSE
    # since otherwise unmatched patients or those with zero weight
    # would be included, too!
    data <- MatchThem::complete(x, action = i, all = FALSE, include = FALSE)

    # create a temporary case ID (seq_len handles the 0-row edge case,
    # where seq(1, 0, 1) would yield c(1, 0))
    data <- data |>
      dplyr::mutate(caseid = seq_len(nrow(data)))

    # apply anesrake function to re-weight -----------------------------------
    anesrake_out <- anesrake::anesrake(
      # target distributions for raking procedure
      inputter = targets,
      # data to be weighted
      dataframe = data,
      # patient identifier
      caseid = data$caseid,
      # other weights that should be accounted for before re-weighting is conducted
      # for matching this is 0 (unmatched) and 1 (matched) and for weighting this is the actual weights
      weightvec = data$weights,
      # no verbosity
      verbose = FALSE
    )

    # temporary dataframe mapping case id -> updated raking weight
    caseweights <- data.frame(caseid = anesrake_out$caseid, weights = anesrake_out$weightvec)

    data_return <- data |>
      # replace "old" weights with the new raking weights
      dplyr::select(-weights) |>
      dplyr::left_join(caseweights, by = "caseid") |>
      # now we can drop the temporary id
      dplyr::select(-caseid)

    return(data_return)

  }

  # return list of imputed, matched/weighted and re-weighted datasets (mirwds);
  # one element per imputation in the mimids/wimids object
  mirwds <- lapply(
    X = seq_along(x$models),
    FUN = estimate_raking_weights
  )

  return(mirwds)

}
| 1 |
#' Manually fit and pool Cox proportional hazards model results from multiple imputed datasets |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function manually fits and pools results from Cox proportional hazards models using |
|
| 5 |
#' a list of imputed datasets. It leverages the `mice::as.mira` and `mice::pool` functions to |
|
| 6 |
#' ensure proper pooling of results across multiple imputations using Rubin's rules. |
|
| 7 |
#' |
|
| 8 |
#' @details |
|
| 9 |
#' The function requires a list of imputed data frames with weights and optional cluster (matching) information, |
|
| 10 |
#' as well as a formula for the Cox proportional hazards model. The data frames must include a column named |
|
| 11 |
#' \code{weights}, and optionally a column named \code{subclass} (for matched datasets to indicate cluster membership).
|
|
| 12 |
#' |
|
| 13 |
#' This function is particularly useful when working with imputed datasets that are not in the form of |
|
| 14 |
#' \code{mimids} or \code{wimids} objects, such as when intermediate steps like raking weights
|
|
| 15 |
#' (via \code{raking_weights}) are applied. It provides a flexible way to fit and pool Cox models
|
|
| 16 |
#' while ensuring compatibility with Rubin's rules for multiple imputation. |
|
| 17 |
#' |
|
| 18 |
#' The function follows these steps: |
|
| 19 |
#' 1. Fit a Cox proportional hazards model to each imputed dataset. If a \code{subclass} column is present,
|
|
| 20 |
#' it is used as a cluster variable for matched pairs. |
|
| 21 |
#' 2. Convert the list of fitted models into a \code{mira} object using \code{mice::as.mira}.
|
|
| 22 |
#' 3. Pool the results using \code{mice::pool}, which applies Rubin's rules for combining estimates
|
|
| 23 |
#' and variances across imputations. |
|
| 24 |
#' 4. Format the pooled results, including exponentiating the hazard ratios and calculating confidence intervals. |
|
| 25 |
#' |
|
| 26 |
#' @seealso |
|
| 27 |
#' \code{\link[survival]{coxph}}, \code{\link[mice]{pool}}, \code{\link[mice]{as.mira}},
|
|
| 28 |
#' \code{\link[MatchThem]{matchthem}}, \code{\link[MatchThem]{weightthem}}
|
|
| 29 |
#' |
|
| 30 |
#' @param x A list of imputed datasets with weights or raking weights (e.g., from \code{\link[encore.analytics]{raking_weights}})
|
|
| 31 |
#' and optional cluster information (for matched datasets). |
|
| 32 |
#' @param surv_formula A formula for the Cox proportional hazards model (default is \code{Surv(fu_itt_months, death_itt) ~ treat}).
|
|
| 33 |
#' |
|
| 34 |
#' @return A data frame containing the pooled results, including hazard ratios, confidence intervals, and p-values. |
|
| 35 |
#' |
|
| 36 |
#' @md |
|
| 37 |
#' |
|
| 38 |
#' @export |
|
| 39 |
#' |
|
| 40 |
#' @examples |
|
| 41 |
#' library(encore.analytics) |
|
| 42 |
#' library(mice) |
|
| 43 |
#' library(MatchThem) |
|
| 44 |
#' |
|
| 45 |
#' # Simulate a cohort with 500 patients and 20% missing data |
|
| 46 |
#' data <- simulate_data( |
|
| 47 |
#' n = 500, |
|
| 48 |
#' imposeNA = TRUE, |
|
| 49 |
#' propNA = 0.2 |
|
| 50 |
#' ) |
|
| 51 |
#' |
|
| 52 |
#' # Impute the data |
|
| 53 |
#' set.seed(42) |
|
| 54 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 55 |
#' |
|
| 56 |
#' # Fit a propensity score model |
|
| 57 |
#' fit <- as.formula(treat ~ dem_age_index_cont + dem_sex_cont + c_smoking_history) |
|
| 58 |
#' |
|
| 59 |
#' # Weight patients within each imputed dataset |
|
| 60 |
#' wimids <- weightthem( |
|
| 61 |
#' formula = fit, |
|
| 62 |
#' datasets = mids, |
|
| 63 |
#' approach = "within", |
|
| 64 |
#' method = "glm", |
|
| 65 |
#' estimand = "ATO" |
|
| 66 |
#' ) |
|
| 67 |
#' |
|
| 68 |
#' # Create a list of imputed and weighted datasets |
|
| 69 |
#' wimids_list <- MatchThem::complete(wimids, action = "all", all = FALSE, include = FALSE) |
|
| 70 |
#' |
|
| 71 |
#' # Define a survival model formula |
|
| 72 |
#' cox_fit <- as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 73 |
#' |
|
| 74 |
#' # Fit and pool Cox proportional hazards model results |
|
| 75 |
#' cox_pooling(wimids_list, surv_formula = cox_fit) |
|
| 76 |
cox_pooling <- function(x,
                        surv_formula = stats::as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat)
                        ){

  # input checks----

  # check if object is a list or mild object
  # (|| for scalar conditions; message names the actual parameter <x>)
  assertthat::assert_that(inherits(x, "list") || inherits(x, "mild"), msg = "<x> needs to be a list or mild object")
  # check if surv_formula is a formula
  assertthat::assert_that(inherits(surv_formula, "formula"), msg = "<surv_formula> needs to be a formula")
  # check that a weights column is present (taken from the first dataset;
  # all imputed datasets are assumed to share the same columns)
  df_colnames <- colnames(x[[1]])
  assertthat::assert_that("weights" %in% df_colnames, msg = "<weights> not in <x>")

  # Step 1: fit Cox model to each imputed dataset----
  cox_fit <- function(i){

    # if a subclass column is present (matched datasets), use it as the
    # cluster variable for robust variance estimation; weights/subclass are
    # resolved inside x[[i]] by coxph's non-standard evaluation
    if("subclass" %in% df_colnames){

      survival::coxph(
        data = x[[i]],
        formula = surv_formula,
        weights = weights,
        cluster = subclass,
        robust = TRUE
      )

    }else{

      survival::coxph(
        data = x[[i]],
        formula = surv_formula,
        weights = weights,
        robust = TRUE
      )

    }

  }

  # loop cox_fit function over each imputed dataset
  cox_results_list <- lapply(
    X = seq_along(x),
    FUN = cox_fit
  )

  # Step 2: extract results into a format suitable for pooling
  # convert each model into a "mira" object (required for pooling)
  mira_object <- mice::as.mira(cox_results_list)

  # Step 3: pool the results using Rubin's rules
  pooled_results <- mice::pool(mira_object)

  # Step 4: format the pooled results (exponentiate to hazard ratios, add CIs)
  cox_result <- broom::tidy(pooled_results, exponentiate = TRUE, conf.int = TRUE)

  return(cox_result)

}
| 1 |
#' Wrapper around \code{gtsummary::tbl_summary()} to create a beautiful Table 1 quickly
|
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Builds a table 1 using the \code{gtsummary} package. The function takes a one-row-per-patient dataframe
|
|
| 5 |
#' and creates a summary table of covariates stratified by treatment group. |
|
| 6 |
#' |
|
| 7 |
#' @details |
|
| 8 |
#' Wrapper for \code{\link[gtsummary]{tbl_summary}}.
|
|
| 9 |
#' |
|
| 10 |
#' The function \code{create_table1} is a wrapper around the \code{gtsummary::tbl_summary()} function.
|
|
| 11 |
#' It is designed to create a summary table of covariates stratified by treatment group. |
|
| 12 |
#' The function requires a one-row-per-patient dataframe and creates a summary table of covariates stratified by treatment group. |
|
| 13 |
#' |
|
| 14 |
#' @importFrom smd smd |
|
| 15 |
#' |
|
| 16 |
#' @param x dataframe with individual-level patient data in a one-row-per-patient format with treatment stratification variable and covariates to be displayed in the Table 1 |
|
| 17 |
#' @param covariates character vector of columns/covariate names to be displayed in Table 1 |
|
| 18 |
#' @param covariates_labels named character vector or list of formulas specifying variables labels of covariate-label pairs to display in table |
|
| 19 |
#' @param treat character specifying column name of treatment variable |
|
| 20 |
#' @param explicit_na_categorical logical, should missings in categorical variables be explicitly included as a separate category (default is TRUE) |
|
| 21 |
#' |
|
| 22 |
#' @return object of class "tbl_summary" "gtsummary" |
|
| 23 |
#' |
|
| 24 |
#' @export |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' library(encore.analytics) |
|
| 29 |
#' |
|
| 30 |
#' # simulate a cohort with 1,000 patients with 20% missing data |
|
| 31 |
#' data <- simulate_data( |
|
| 32 |
#' n = 1000, |
|
| 33 |
#' imposeNA = TRUE, |
|
| 34 |
#' propNA = 0.2 |
|
| 35 |
#' ) |
|
| 36 |
#' |
|
| 37 |
#' # create a Table 1 |
|
| 38 |
#' create_table1( |
|
| 39 |
#' x = data, |
|
| 40 |
#' covariates = c("dem_age_index_cont", "dem_sex_cont", "c_smoking_history"),
|
|
| 41 |
#' covariates_labels = list( |
|
| 42 |
#' "dem_age_index_cont" = "Age", |
|
| 43 |
#' "dem_sex_cont" = "Sex", |
|
| 44 |
#' "c_smoking_history" = "Smoking history" |
|
| 45 |
#' ), |
|
| 46 |
#' treat = "treat" |
|
| 47 |
#' ) |
|
| 48 |
#'} |
|
| 49 |
create_table1 <- function(x = NULL,
                          covariates = NULL,
                          covariates_labels = NULL,
                          treat = "treat",
                          explicit_na_categorical = TRUE
){

  # optional dependencies required by gtsummary::add_difference(test = "smd")
  install_on_demand("cardx")
  install_on_demand("smd")

  # check input
  # inherits() is the reliable test: tibbles carry classes "tbl_df"/"tbl"/
  # "data.frame" but never "tibble", so matching class(x) against "tibble"
  # was dead code
  assertthat::assert_that(inherits(x, "data.frame"), msg = "<x> is not a data.frame or tibble")
  # check non-emptiness first: all(NULL %in% ...) is vacuously TRUE, so an
  # empty <covariates> would otherwise slip past the membership check below
  assertthat::assert_that(length(covariates) > 0, msg = "<covariates> must be a non-empty character vector")
  assertthat::assert_that(all(covariates %in% colnames(x)), msg = "some or all of <covariates> not in <x>")
  assertthat::assert_that(is.list(covariates_labels) || is.null(covariates_labels), msg = "<covariates_labels> needs to be a named character vector or list of formulas specifying variables labels")
  assertthat::assert_that(is.null(covariates_labels) || length(covariates_labels) == length(covariates), msg = "<covariates_labels> must be a named character vector or list of formulas specifying variables labels")
  assertthat::assert_that(length(treat) == 1, msg = "<treat> must be a single character string")
  assertthat::assert_that(treat %in% colnames(x), msg = "<treat> not in <x>")
  assertthat::assert_that(is.logical(explicit_na_categorical), msg = "<explicit_na_categorical> must be a logical TRUE or FALSE")

  # assign treatment (stratification) variable
  # and subset to the columns shown in the table
  x[["strata"]] <- x[[treat]]
  x <- x[c(covariates, "strata")]

  if(explicit_na_categorical){

    # encode NA explicitly as "Missing" (only applicable to categorical)
    x <- x |>
      # logicals into categorical (factor() coerces TRUE/FALSE to the
      # matching character levels)
      dplyr::mutate(
        dplyr::across(
          dplyr::where(function(.x) is.logical(.x)),
          .fns = function(.x) factor(.x, levels = c("TRUE", "FALSE", "Missing"))
        )
      ) |>
      # for all categorical, assign a "Missing" category
      dplyr::mutate(
        dplyr::across(
          dplyr::where(function(.x) is.character(.x) | is.factor(.x)),
          .fns = function(.x) forcats::fct_na_value_to_level(.x, level = "Missing")
        )
      ) |>
      # drop "Missing" level for covariates which are fully observed/have no NA
      droplevels()
  }

  # build the stratified summary table with SMDs and an overall column
  table1 <- x |>
    gtsummary::tbl_summary(
      by = "strata",
      missing = "ifany",
      missing_text = "Missing",
      label = covariates_labels
    ) |>
    gtsummary::add_difference(test = dplyr::everything() ~ "smd") |>
    gtsummary::modify_column_hide(conf.low) |>
    gtsummary::add_overall() |>
    gtsummary::modify_header(
      label ~ "**Patient characteristic**",
      stat_0 ~ "**Total** <br> N = {N}",
      stat_1 ~ "**{level}** <br> N = {n} <br> ({style_percent(p, digits=1)}%)",
      stat_2 ~ "**{level}** <br> N = {n} <br> ({style_percent(p, digits=1)}%)"#,
      #test_result ~ "**SMD**"
    ) |>
    gtsummary::modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment received**")

  return(table1)

}
| 1 |
#' Calculate Agreement Metrics Between RCT and RWE Results |
|
| 2 |
#' |
|
| 3 |
#' @description Creates a formatted table comparing results from randomized controlled trials (RCTs) |
|
| 4 |
#' with real-world evidence (RWE) emulation studies by calculating agreement metrics. |
|
| 5 |
#' |
|
| 6 |
#' @details |
|
| 7 |
#' Calculates three types of agreement metrics: |
|
| 8 |
#' - Statistical significance agreement: Same direction and statistical significance |
|
| 9 |
#' - Estimate agreement: RWE estimate within RCT confidence interval |
|
| 10 |
#' - SMD agreement: Standardized mean difference below threshold (default 1.96) |
|
| 11 |
#' |
|
| 12 |
#' All estimates should be hazard ratios (HR) and must be positive. |
|
| 13 |
#' Estimates are log-transformed for SMD calculation. |
|
| 14 |
#' |
|
| 15 |
#' @importFrom tidyr pivot_wider pivot_longer |
|
| 16 |
#' @importFrom smd smd |
|
| 17 |
#' |
|
| 18 |
#' @param x A data.frame or tibble containing RCT and RWE results (in wide format, see examples). |
|
| 19 |
#' Must contain these columns: |
|
| 20 |
#' |
|
| 21 |
#' - rct_estimate = Point estimate from the RCT |
|
| 22 |
#' - rct_lower = Lower confidence limit of the RCT result |
|
| 23 |
#' - rct_upper = Upper confidence limit of the RCT result |
|
| 24 |
#' - rwe_estimate = Point estimate from the RWE emulation study |
|
| 25 |
#' - rwe_lower = Lower confidence limit of the RWE result |
|
| 26 |
#' - rwe_upper = Upper confidence limit of the RWE result |
|
| 27 |
#' |
|
| 28 |
#' @param analysis_col Character. Name of column identifying different analyses |
|
| 29 |
#' @param group_col Character. Optional. Name of column for grouping results (e.g., database) |
|
| 30 |
#' @param estimate_label Character. Label for estimates in table header. Default: "HR (95% CI)" |
|
| 31 |
#' @param smd_threshold Numeric. Threshold for SMD agreement. Default: 1.96 (α=0.05) |
|
| 32 |
#' |
|
| 33 |
#' @return A gt table object with formatted agreement metrics |
|
| 34 |
#' |
|
| 35 |
#' @md |
|
| 36 |
#' |
|
| 37 |
#' @examples |
|
| 38 |
#' # Example 1: One analysis per database with grouping |
|
| 39 |
#' |
|
| 40 |
#'x <- tibble::tribble( |
|
| 41 |
#'~Analysis, ~Database, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 42 |
#'"Main analysis", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 43 |
#'"Main analysis", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 44 |
#'"Main analysis", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 45 |
#') |
|
| 46 |
#' |
|
| 47 |
#' agreement_metrics(x, analysis_col = "Analysis", group_col = "Database") |
|
| 48 |
#' |
|
| 49 |
#' # Example 2: Three analyses, one database without grouping |
|
| 50 |
#' |
|
| 51 |
#' x <- tibble::tribble( |
|
| 52 |
#' ~Analysis, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 53 |
#' "Main analysis", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 54 |
#' "Sensitivity analysis 1", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 55 |
#' "Sensitivity analysis 2", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 56 |
#' ) |
|
| 57 |
#' |
|
| 58 |
#' agreement_metrics(x, analysis_col = "Analysis") |
|
| 59 |
#' |
|
| 60 |
#' # Example 3: Three analyses per database with grouping |
|
| 61 |
#' x <- tibble::tribble( |
|
| 62 |
#' ~Analysis, ~Database, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 63 |
#' "Main analysis", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 64 |
#' "Main analysis", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 65 |
#' "Main analysis", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6, |
|
| 66 |
#' "Sensitivity analysis 1", "Database 1", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 67 |
#' "Sensitivity analysis 1", "Database 2", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6, |
|
| 68 |
#' "Sensitivity analysis 1", "Database 3", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 69 |
#' "Sensitivity analysis 2", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 70 |
#' "Sensitivity analysis 2", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 71 |
#' "Sensitivity analysis 2", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 72 |
#' ) |
|
| 73 |
#' |
|
| 74 |
#' agreement_metrics(x, analysis_col = "Analysis", group_col = "Database") |
|
| 75 |
#' |
|
| 76 |
#' @export |
|
| 77 |
agreement_metrics <- function(x,
                              analysis_col,
                              group_col = NULL, # subgroup, e.g., database
                              estimate_label = "HR (95% CI)",
                              smd_threshold = 1.96 # customizable threshold for SMD agreement
){

  # input checks
  assertthat::assert_that(inherits(x, "data.frame"), msg = "<x> is not a data.frame or tibble")
  assertthat::assert_that(all(c("rct_estimate", "rct_lower", "rct_upper", "rwe_estimate", "rwe_lower", "rwe_upper") %in% colnames(x)), msg = "<x> does not contain all required columns")
  assertthat::assert_that(!is.null(analysis_col), msg = "<analysis_col> cannot be NULL")
  assertthat::assert_that(analysis_col %in% colnames(x), msg = sprintf("<analysis_col> '%s' not found in <x>", analysis_col))
  if(!is.null(group_col)) assertthat::assert_that(group_col %in% colnames(x), msg = "<group_col> not in <x>")

  # hazard ratios must be strictly positive (they are log-transformed below)
  assertthat::assert_that(all(x$rct_estimate > 0), msg = "RCT estimates must be positive for hazard ratios")
  assertthat::assert_that(all(x$rwe_estimate > 0), msg = "RWE estimates must be positive for hazard ratios")

  # if no smd_value present in the results table, calculate it row by row;
  # a failing row yields NA_real_ with a warning instead of aborting
  if(!"smd_value" %in% colnames(x)){
    x <- x |>
      dplyr::rowwise() |>
      dplyr::mutate(
        smd_value = tryCatch({
          smd_agreement(
            rct_estimate = log(rct_estimate),
            rct_lower = log(rct_lower),
            rct_upper = log(rct_upper),
            rwe_estimate = log(rwe_estimate),
            rwe_lower = log(rwe_lower),
            rwe_upper = log(rwe_upper)
          )
        }, error = function(e) {
          warning(sprintf("SMD calculation failed: %s", e$message))
          NA_real_
        })
      ) |>
      dplyr::ungroup()
  }

  # calculate agreement metrics
  x <- x |>
    dplyr::mutate(
      # significance agreement: same direction AND same significance status
      significance_agreement = dplyr::case_when(
        # superiority in both
        rct_estimate < 1 & rct_upper < 1 & rwe_estimate < 1 & rwe_upper < 1 ~ "Yes",
        # non-significant protective effect in both
        rct_estimate < 1 & rct_upper >= 1 & rwe_estimate < 1 & rwe_upper >= 1 ~ "Yes",
        # significant harmful effect in both
        rct_estimate >= 1 & rct_lower >= 1 & rwe_estimate >= 1 & rwe_upper >= 1 ~ "Yes",
        # all other cases
        TRUE ~ "No"
      ),

      # estimate agreement: RWE point estimate falls inside the RCT CI
      estimate_agreement = dplyr::if_else(
        rwe_estimate >= rct_lower & rwe_estimate <= rct_upper, "Yes", "No"
      ),

      # SMD agreement against the user-supplied threshold
      smd_agreement = dplyr::case_when(
        is.na(smd_value) ~ "NA",
        abs(smd_value) < smd_threshold ~ "Yes",
        TRUE ~ "No"
      )
    )

  # format table for display
  x_format <- x |>
    dplyr::mutate(dplyr::across(dplyr::where(is.numeric), ~ format(.x, digits = 2, nsmall = 2))) |>
    dplyr::mutate(
      RCT = glue::glue("{rct_estimate} ({rct_lower} - {rct_upper})"),
      RWE = glue::glue("{rwe_estimate} ({rwe_lower} - {rwe_upper})"),
      smd_agreement = glue::glue("{smd_agreement} ({smd_value})")
    ) |>
    dplyr::select(dplyr::all_of(c(analysis_col, group_col)), "RCT", "RWE", "significance_agreement", "estimate_agreement", "smd_agreement")

  # conditional grouping (e.g. one row group per database)
  if (!is.null(group_col)) {
    x_format <- x_format |>
      dplyr::group_by(dplyr::across(dplyr::all_of(group_col)))
  }

  # create and format gt table
  x_gt <- x_format |>
    gt::gt() |>
    gt::tab_spanner(
      label = estimate_label,
      columns = c("RCT", "RWE")
    ) |>
    gt::cols_label(
      significance_agreement = gt::md("Statistical <br> significance <br> agreement"),
      estimate_agreement = gt::md("Estimate <br> agreement"),
      smd_agreement = "SMD"
    )

  # apply styles for agreement columns: green for "Yes", red for "No";
  # the smd_agreement cells embed the value ("Yes (0.12)"), hence str_detect
  for (col in c("significance_agreement", "estimate_agreement", "smd_agreement")) {
    x_gt <- x_gt |>
      gt::tab_style(
        style = gt::cell_text(color = "darkgreen"),
        locations = gt::cells_body(
          columns = !!col,
          rows = if (col == "smd_agreement") {
            stringr::str_detect(x_format[[col]], "Yes")
          } else {
            x_format[[col]] == "Yes"
          }
        )
      ) |>
      gt::tab_style(
        style = gt::cell_text(color = "darkred"),
        locations = gt::cells_body(
          columns = !!col,
          rows = if (col == "smd_agreement") {
            stringr::str_detect(x_format[[col]], "No")
          } else {
            x_format[[col]] == "No"
          }
        )
      )
  }

  # footnote abbreviations
  # BUG FIX: footnote_label was previously only assigned for the default
  # estimate_label, so any other label crashed the glue() call below with
  # "object 'footnote_label' not found". Build the abbreviation list
  # conditionally instead; the default label yields the identical footnote.
  abbreviations <- c(
    "CI = Confidence interval",
    if (estimate_label == "HR (95% CI)") "HR = Hazard ratio",
    "RCT = Randomized controlled trial",
    "RWE = Real-world evidence",
    "SMD = standardized mean difference (based on log hazard ratios)"
  )

  # Add bold styling for headers and attach the footnote
  x_gt <- x_gt |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_column_labels()
    ) |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_column_spanners()
    ) |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_row_groups()
    ) |>
    gt::tab_footnote(
      paste0("Abbreviations: ", paste(abbreviations, collapse = ", "))
    )

  return(x_gt)
}
| 1 |
#' Pooled Kaplan-Meier estimate and survival curve |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Computes pooled median survival Kaplan-Meier estimates using Rubin's rule |
|
| 5 |
#' and outputs corresponding Kaplan-Meier curve across imputed and |
|
| 6 |
#' matched/weighted datasets |
|
| 7 |
#' |
|
| 8 |
#' @details |
|
| 9 |
#' The function requires an object of class mimids or wimids (\code{x}), which is the output
|
|
| 10 |
#' of a workflow that requires imputing multiple (m) datasets using mice or amelia |
|
| 11 |
#' and matching or weighting each imputed dataset via the MatchThem package |
|
| 12 |
#' (see examples). |
|
| 13 |
#' |
|
| 14 |
#' The function fits the pre-specified survfit model (\code{surv_formula}, \code{\link[survival]{survfit}} package)
|
|
| 15 |
#' to compute survival probabilities at each individual time point according to the Kaplan-Meier method. |
|
| 16 |
#' For matched and weighted datasets, weights, cluster membership (matching only) and robust |
|
| 17 |
#' variance estimates are considered in the \code{\link[survival]{survfit}} call by default.
|
|
| 18 |
#' |
|
| 19 |
#' Since survival probabilities typically don't follow normal distributions, |
|
| 20 |
#' these need to be transformed to approximate normality first before pooling |
|
| 21 |
#' across imputed datasets and time points. To that end, survival probabilities are first |
|
| 22 |
#' transformed using a complementary log-log transformation (\code{log(-log(1-pr(surv)))})
|
|
| 23 |
#' as recommended by multiple sources (Marshall, Billingham, and Bryan (2009)). |
|
| 24 |
#' |
|
| 25 |
#' To pool the transformed estimates across imputed datasets and time points, the pool.scalar |
|
| 26 |
#' function is used to apply Rubin's rule to combine pooled estimates (qbar) according to |
|
| 27 |
#' formula (3.1.2) Rubin (1987) and to compute the corresponding total variance (t) of the pooled |
|
| 28 |
#' estimate according to formula (3.1.5) Rubin (1987). |
|
| 29 |
#' |
|
| 30 |
#' The pooled survival probabilities are then back-transformed via \code{1-exp(-exp(qbar))}
|
|
| 31 |
#' for pooled survival probability estimates and \code{1-exp(-exp(qbar +/- 1.96*sqrt(t)))}
|
|
| 32 |
#' for lower and upper 95% confidence intervals. As the formula indicates, the pooled standard |
|
| 33 |
#' error is computed as the square root of the total variance. The vertically stacked table |
|
| 34 |
#' with transformed and backtransformed estimates is returned with the |
|
| 35 |
#' \code{km_survival_table} table.
|
|
| 36 |
#' |
|
| 37 |
#' Finally, the median survival time is extracted from the \code{km_survival_table} table
|
|
| 38 |
#' by determining the time the survival probability drops below .5 for the first time. |
|
| 39 |
#' For this a sub-function of Terry M. Therneau's \code{\link[survival]{print.survfit}} function
|
|
| 40 |
#' is used. Therneau also considers some edge cases/nuisances (x = time, y = surv): |
|
| 41 |
#' - Nuisance 1: if one of the y's is exactly .5, we want the mean of the corresponding x and the first x for which y<.5. We need to use the equivalent of all.equal to check for a .5 however: survfit(Surv(1:100)~1) gives a value of .5 + 1.1e-16 due to roundoff error. |
|
| 42 |
#' - Nuisance 2: there may be an NA in the y's
|
| 43 |
#' - Nuisance 3: if no y's are <=.5, then we should return NA |
|
| 44 |
#' - Nuisance 4: the obs (or many) after the .5 may be censored, giving a stretch of values = .5 +- epsilon |
|
| 45 |
#' |
|
| 46 |
#' The function follows the following logic: |
|
| 47 |
#' 1. Fit Kaplan-Meier survival function to each imputed and matched/weighted dataset |
|
| 48 |
#' 2. Transform survival probabilities using complementary log-log transformation |
|
| 49 |
#' 3. Pool transformed survival probabilities and compute total variance using Rubin's rule |
|
| 50 |
#' 4. Back-transform pooled survival probabilities and compute 95% confidence intervals |
|
| 51 |
#' 5. Extract median survival time and corresponding 95% confidence intervals |
|
| 52 |
#' 6. Plot Kaplan-Meier curve with pooled survival probabilities and confidence intervals |
|
| 53 |
#' |
|
| 54 |
#' More references: |
|
| 55 |
#' - https://stefvanbuuren.name/fimd/sec-pooling.html |
|
| 56 |
#' - https://link.springer.com/article/10.1007/s10198-008-0129-y |
|
| 57 |
#' - https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-015-0048-4 |
|
| 58 |
#' |
|
| 59 |
#' @seealso |
|
| 60 |
#' \code{\link[survival]{survfit}} \code{\link[mice]{pool.scalar}} \code{\link[MatchThem]{matchthem}} \code{\link[MatchThem]{weightthem}}
|
|
| 61 |
#' |
|
| 62 |
#' @param x imputed and matched (mimids) or weighted (wimids) object |
|
| 63 |
#' @param surv_formula specification of survival model formula to be fitted |
|
| 64 |
#' |
|
| 65 |
#' @return list with pooled median survival estimate and pooled Kaplan-Meier curve |
|
| 66 |
#' \code{km_median_survival}:
|
|
| 67 |
#' - strata = stratum |
|
| 68 |
#' - t_median = median survival time |
|
| 69 |
#' - t_lower = lower 95% CI of median survival time |
|
| 70 |
#' - t_upper = upper 95% CI of median survival time |
|
| 71 |
#' |
|
| 72 |
#' \code{km_survival_table}:
|
|
| 73 |
#' - strata = stratum |
|
| 74 |
#' - time = observed time point |
|
| 75 |
#' - m = number of imputed datasets |
|
| 76 |
#' - qbar = pooled univariate estimate of the complementary log-log transformed survival probabilities, see formula (3.1.2) Rubin (1987) |
|
| 77 |
#' - t = total variance of the pooled univariate estimate of the complementary log-log transformed survival probabilities, formula (3.1.5) Rubin (1987) |
|
| 78 |
#' - se = total standard error of the pooled estimate (derived as sqrt(t)) |
|
| 79 |
#' - surv = back-transformed pooled survival probability |
|
| 80 |
#' - lower = Wald-type lower 95% confidence interval of back-transformed pooled survival probability |
|
| 81 |
#' - upper = Wald-type upper 95% confidence interval of back-transformed pooled survival probability |
|
| 82 |
#' |
|
| 83 |
#' \code{km_plot}: ggplot2 object with Kaplan-Meier curve
|
|
| 84 |
#' |
|
| 85 |
#' @md |
|
| 86 |
#' |
|
| 87 |
#' @export |
|
| 88 |
#' |
|
| 89 |
#' @examples |
|
| 90 |
#' library(encore.analytics) |
|
| 91 |
#' library(mice) |
|
| 92 |
#' library(MatchThem) |
|
| 93 |
#' |
|
| 94 |
#' # simulate a cohort with 1,000 patients with 20% missing data |
|
| 95 |
#' data <- simulate_data( |
|
| 96 |
#' n = 500, |
|
| 97 |
#' imposeNA = TRUE, |
|
| 98 |
#' propNA = 0.2 |
|
| 99 |
#' ) |
|
| 100 |
#' |
|
| 101 |
#' # impute the data |
|
| 102 |
#' set.seed(42) |
|
| 103 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 104 |
#' |
|
| 105 |
#' # fit a propensity score model |
|
| 106 |
#' fit <- as.formula(treat ~ dem_age_index_cont + dem_sex_cont + c_smoking_history) |
|
| 107 |
#' |
|
| 108 |
#' # weight (or alternatively match) patients within each imputed dataset |
|
| 109 |
#' wimids <- weightthem( |
|
| 110 |
#' formula = fit, |
|
| 111 |
#' datasets = mids, |
|
| 112 |
#' approach = "within", |
|
| 113 |
#' method = "glm", |
|
| 114 |
#' estimand = "ATO" |
|
| 115 |
#' ) |
|
| 116 |
#' |
|
| 117 |
#' # fit a survival model |
|
| 118 |
#' km_fit <- as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 119 |
#' |
|
| 120 |
#' # estimate and pool median survival times and Kaplan-Meier curve |
|
| 121 |
#' km_out <- km_pooling( |
|
| 122 |
#' x = wimids, |
|
| 123 |
#' surv_formula = km_fit |
|
| 124 |
#' ) |
|
| 125 |
#' |
|
| 126 |
#' # median survival time |
|
| 127 |
#' km_out$km_median_survival |
|
| 128 |
#' |
|
| 129 |
#' # KM curve |
|
| 130 |
#' km_out$km_plot |
|
| 131 |
#' |
|
| 132 |
km_pooling <- function(x = NULL, |
|
| 133 |
surv_formula = stats::as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 134 |
){
|
|
| 135 | ||
| 136 |
# input checks |
|
| 137 |
# check if x is a mimids or wimids object |
|
| 138 | 3x |
assertthat::assert_that(inherits(x, c("mimids", "wimids")), msg = "<x> needs to be a mimids or wimids object")
|
| 139 |
# check if surv_formula is a formula |
|
| 140 | 2x |
assertthat::assert_that(inherits(surv_formula, "formula"), msg = "<surv_formula> needs to be a formula") |
| 141 |
# check weights and subclass |
|
| 142 | 1x |
assertthat::assert_that("weights" %in% names(MatchThem::complete(x)), msg = "<x> needs to contain a weights column")
|
| 143 |
# check if weights and subclass are in the data |
|
| 144 | 1x |
if(inherits(x, "mimids")){
|
| 145 | ! |
assertthat::assert_that("subclass" %in% names(MatchThem::complete(x)), msg = "<x> needs to contain a weights column")
|
| 146 |
} |
|
| 147 | ||
| 148 |
# Fit Kaplan Meier -------------------------------------------------------- |
|
| 149 | ||
| 150 |
# first, we fit a survival function for ith dataset |
|
| 151 | 1x |
compute_km <- function(i){
|
| 152 | ||
| 153 |
# compute the survival function for dataset i |
|
| 154 | ||
| 155 |
## for weighting |
|
| 156 | 5x |
if(inherits(x, "wimids")){
|
| 157 | ||
| 158 | 5x |
survfit_fit <- survival::survfit( |
| 159 | 5x |
formula = surv_formula, |
| 160 | 5x |
weights = weights, |
| 161 | 5x |
robust = T, |
| 162 | 5x |
data = i |
| 163 |
) |
|
| 164 | ||
| 165 |
} |
|
| 166 | ||
| 167 |
## for matching |
|
| 168 | 5x |
if(inherits(x, "mimids")){
|
| 169 | ||
| 170 | ! |
survfit_fit <- survival::survfit( |
| 171 | ! |
formula = surv_formula, |
| 172 | ! |
weights = weights, |
| 173 | ! |
cluster = subclass, |
| 174 | ! |
robust = T, |
| 175 | ! |
data = i |
| 176 |
) |
|
| 177 | ||
| 178 |
} |
|
| 179 | ||
| 180 | 5x |
return(survfit_fit) |
| 181 | ||
| 182 |
} |
|
| 183 | ||
| 184 | ||
| 185 |
# compute KM across all imputed (and weighted/matched) datasets -------------------- |
|
| 186 | ||
| 187 |
# next, we create a list of datasets to apply the compute_km function to |
|
| 188 |
# action = "all" : produces a mild object (list) of the multiply imputed datasets |
|
| 189 |
# all = FALSE : do NOT include observations with a zero estimated weight |
|
| 190 |
# include = FALSE : do NOT include original data with the missing values |
|
| 191 | ||
| 192 |
# Important: the all = parameter needs to be set to FALSE |
|
| 193 |
# since otherwise unmatched patients or those zero weight |
|
| 194 |
# will be included, too! |
|
| 195 | 1x |
object_list <- MatchThem::complete(x, action = "all", all = FALSE, include = FALSE) |
| 196 | ||
| 197 |
# now apply the compute_km function across all imputed and matched/weighted datasets |
|
| 198 |
# this creates a list with computed survival probabilities |
|
| 199 | 1x |
survival_fit_list <- lapply(object_list, FUN = compute_km) |
| 200 | ||
| 201 | ||
| 202 |
# pool survival probabilities --------------------------------------------- |
|
| 203 | ||
| 204 |
# now to combine/pool quantities with non-normal distributions |
|
| 205 |
# such as survival probabilities, Stef van Buuren |
|
| 206 |
# recommends to tansform survival probabilities |
|
| 207 |
# using a complementary log-log transformation |
|
| 208 |
# https://stefvanbuuren.name/fimd/sec-pooling.html |
|
| 209 |
# reference: Marshall, Billingham, and Bryan (2009) https://link.springer.com/article/10.1007/s10198-008-0129-y |
|
| 210 |
# reference: https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-015-0048-4 |
|
| 211 | ||
| 212 |
# that is, the function cloglog performs the |
|
| 213 |
# transformations on all survival probabilities |
|
| 214 |
# stored in survival_fit_list |
|
| 215 | 1x |
cloglog <- function(i){
|
| 216 | ||
| 217 |
# convert to data.frame |
|
| 218 | 5x |
survival_times_df <- with( |
| 219 | 5x |
summary(i), |
| 220 | 5x |
data.frame(strata, time, surv, std.err, lower, upper) |
| 221 |
) |> |
|
| 222 | 5x |
dplyr::mutate( |
| 223 |
# compute transformed qbar |
|
| 224 | 5x |
q = log(-log(1-surv)), |
| 225 |
# compute transformed U |
|
| 226 | 5x |
u = std.err^2 / ((1-surv) * log(1-surv))^2 |
| 227 |
) |
|
| 228 | ||
| 229 |
} |
|
| 230 | ||
| 231 |
# we apply this function to all fitted survival curves |
|
| 232 | 1x |
km_survival <- lapply(survival_fit_list, FUN = cloglog) |> |
| 233 | ||
| 234 |
# the resulting list with q and u per stratum and dataset |
|
| 235 |
# is then vertically stacked to a long data frame (using bind_rows). |
|
| 236 | 1x |
dplyr::bind_rows() |> |
| 237 | ||
| 238 |
# group by the exposure levels (stratum) and apply the pool.scalar function |
|
| 239 |
# which pools univariate estimates of m datasets. This returns: |
|
| 240 |
# m = The number of imputations |
|
| 241 |
# qbar = The pooled univariate estimate, formula (3.1.2) Rubin (1987) |
|
| 242 |
# t = The total variance of the pooled estimated, formula (3.1.5) Rubin (1987). |
|
| 243 | 1x |
dplyr::group_by(strata, time) |> |
| 244 | 1x |
dplyr::summarize( |
| 245 | 1x |
data.frame(mice::pool.scalar(Q = q, U = u, k = 1)[c("m", "qbar", "t")]),
|
| 246 | 1x |
.groups = "drop" |
| 247 |
) |> |
|
| 248 | ||
| 249 |
# we back-transform back and compute CI's |
|
| 250 | 1x |
dplyr::mutate( |
| 251 | 1x |
surv = 1-exp(-exp(qbar)), |
| 252 | 1x |
se = sqrt(t), |
| 253 | 1x |
lower = 1-exp(-exp(qbar - 1.96*se)), |
| 254 | 1x |
upper = 1-exp(-exp(qbar + 1.96*se)) |
| 255 |
) |> |
|
| 256 | ||
| 257 |
# arrange by strata and descending (!) survival probability |
|
| 258 | 1x |
dplyr::arrange(strata, desc(surv)) |
| 259 | ||
| 260 | ||
| 261 |
# Plot a pooled Kaplan-Meier curve ---------------------------------------- |
|
| 262 | ||
| 263 |
# plot pooled Kaplan-Meier curve based on pooled survival probabilities |
|
| 264 | ||
| 265 | 1x |
km_plot <- km_survival |> |
| 266 | 1x |
ggplot2::ggplot(ggplot2::aes(x = time, y = surv, color = strata, fill = strata)) + |
| 267 | 1x |
ggplot2::geom_line(linewidth = 1.25) + |
| 268 | 1x |
ggplot2::geom_ribbon(ggplot2::aes(ymin = lower, ymax = upper), linetype = 0, alpha = 0.2) + |
| 269 | 1x |
ggplot2::theme_minimal(base_size = 14) + |
| 270 | 1x |
ggplot2::labs( |
| 271 | 1x |
x = "Follow-up time", |
| 272 | 1x |
y = "Pr(Survival) [%]", |
| 273 | 1x |
color = "Exposure", |
| 274 | 1x |
fill = "Exposure" |
| 275 |
) + |
|
| 276 | 1x |
ggplot2::theme(legend.position = "top") |
| 277 | ||
| 278 | ||
| 279 |
# Extract median survival time -------------------------------------------- |
|
| 280 | ||
| 281 |
# This section is taken from Terry M. Therneau's |
|
| 282 |
# print.survfit function in the survival package! |
|
| 283 | ||
| 284 |
# to extract the median survival times, we |
|
| 285 |
# apply the same sub-functions used in |
|
| 286 |
# print.survfit function of the original survival package |
|
| 287 |
# which defines the median survival time as the time |
|
| 288 |
# the survival probability drops below .5 for the first time |
|
| 289 |
# see https://github.com/cran/survival/blob/master/R/print.survfit.R#L109 |
|
| 290 | ||
| 291 |
# Therneau defined some edge cases (x = time, y = surv): |
|
| 292 |
# Nuisance 1: if one of the y's is exactly .5, we want the mean of |
|
| 293 |
# the corresponding x and the first x for which y<.5. We need to |
|
| 294 |
# use the equivalent of all.equal to check for a .5 however: |
|
| 295 |
# survfit(Surv(1:100)~1) gives a value of .5 + 1.1e-16 due to |
|
| 296 |
# roundoff error. |
|
| 297 |
# Nuisance 2: there may be an NA in the y's |
|
| 298 |
# Nuisance 3: if no y's are <=.5, then we should return NA |
|
| 299 |
# Nuisance 4: the obs (or many) after the .5 may be censored, giving |
|
| 300 |
# a stretch of values = .5 +- epsilon |
|
| 301 | 1x |
# Median finder taken from Terry Therneau's print.survfit (survival
# package): returns the first time `x` at which the survival curve `y`
# reaches 0.5, handling the edge cases documented above (a value of
# exactly .5 up to roundoff, NAs in y, and no crossing at all → NA).
minmin <- function(y, x) {
  tol <- .Machine$double.eps^0.5  # same tolerance as used in all.equal()
  below <- !is.na(y) & y < 0.5 + tol
  if (!any(below)) {
    return(NA)
  }
  x_below <- x[below]
  y_below <- y[below]
  # If the first retained y is (numerically) exactly .5 and the curve
  # later drops strictly lower, report the mean of the tied time and
  # the first strictly-lower time; otherwise the first retained time
  # is the median.
  if (abs(y_below[1] - 0.5) < tol && any(y_below < y_below[1])) {
    first_lower <- min(which(y_below < y_below[1]))
    (x_below[1] + x_below[first_lower]) / 2
  } else {
    x_below[1]
  }
}
|
| 313 | ||
| 314 |
# apply this across strata listed in the km_survival table |
|
| 315 | 1x |
# Extract the median survival time (and its 95% CI) for one stratum `i`
# of the pooled `km_survival` table, following print.survfit: the median
# is the first time the survival probability drops below .5, and the CI
# bounds are the times the pointwise CI curves do the same.
# Returns a one-row tibble with columns strata, t_median, t_lower, t_upper.
minmin_strata <- function(i){

  km_survival_stratum <- km_survival |>
    dplyr::filter(strata == i)

  # median survival time
  med <- minmin(y = km_survival_stratum$surv, x = km_survival_stratum$time)

  # 95% CI of the median survival time = times at which the lower and
  # upper pointwise 95% confidence curves of the survival probability
  # drop below .5
  if (!is.null(km_survival_stratum$upper)) {
    upper <- minmin(y = km_survival_stratum$upper, x = km_survival_stratum$time)
    lower <- minmin(y = km_survival_stratum$lower, x = km_survival_stratum$time)
  } else {
    # Therneau assigns 0 if no confidence intervals are available
    upper <- 0
    lower <- 0
  }

  # one-row output table for this stratum (returned directly; the
  # original assigned intermediate results to unused locals)
  tibble::tibble(
    strata = i,       # stratum
    t_median = med,   # median survival time
    t_lower = lower,  # lower 95% CI of median survival time
    t_upper = upper   # upper 95% CI of median survival time
  )
}
|
| 349 | ||
| 350 | 1x |
km_median_survival <- do.call(rbind, lapply(X = unique(km_survival$strata), FUN = minmin_strata)) |
| 351 | ||
| 352 | ||
| 353 |
# return results ---------------------------------------------------------- |
|
| 354 | ||
| 355 | 1x |
return( |
| 356 | 1x |
list( |
| 357 | 1x |
km_median_survival = km_median_survival, |
| 358 | 1x |
km_plot = km_plot, |
| 359 | 1x |
km_survival_table = km_survival |
| 360 |
) |
|
| 361 |
) |
|
| 362 | ||
| 363 | ||
| 364 |
} |
| 1 |
#' Install package on demand |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function checks if a package is installed. If not, it |
|
| 5 |
#' asks the user if they want to install it. If the user |
|
| 6 |
#' agrees, it installs the package from CRAN. |
|
| 7 |
#' |
|
| 8 |
#' @param pkg a character string with the name of the package |
|
| 9 |
#' @param quiet logical. If TRUE, suppresses messages during installation |
|
| 10 |
#' @param ... additional arguments passed to \code{install.packages}
|
|
| 11 |
#' |
|
| 12 |
install_on_demand <- function(pkg, quiet = FALSE, ...) {
  # Checks whether package `pkg` is available in the library. If not,
  # asks the user for permission to install it from CRAN (interactive
  # sessions only; in non-interactive sessions this is a silent no-op).
  #
  # pkg:   character string, name of the package
  # quiet: logical, passed on to install.packages() to suppress output
  # ...:   additional arguments forwarded to install.packages()
  #        (the original version documented but never forwarded these)
  #
  # Returns NULL invisibly.
  if (requireNamespace(pkg, quietly = TRUE)) {
    return(invisible(NULL))
  }
  if (interactive()) {
    answer <- utils::askYesNo(paste("Package", pkg, "needed. Install from CRAN?"))
    # askYesNo() returns NA when the user cancels; a bare `if (answer)`
    # would then error ("missing value where TRUE/FALSE needed"), so
    # treat cancel the same as "No".
    if (isTRUE(answer)) {
      utils::install.packages(pkg, quiet = quiet, ...)
    }
  }
  invisible(NULL)
}