| 1 |
#' Calculate Standardized Mean Difference (SMD) Between RCT and RWE Results |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function calculates the standardized mean difference (SMD) between results from |
|
| 5 |
#' a randomized controlled trial (RCT) and its real-world evidence (RWE) emulation. |
|
| 6 |
#' The SMD provides a measure of agreement between the two results, taking into account |
|
| 7 |
#' both the point estimates and their corresponding uncertainty. |
|
| 8 |
#' |
|
| 9 |
#' @details |
|
| 10 |
#' The SMD is calculated as the difference between RCT and RWE estimates, divided by |
|
| 11 |
#' the square root of the sum of their variances. Variances are derived from the |
|
| 12 |
#' confidence intervals assuming normal distribution (using 1.96 for 95% CI). |
|
| 13 |
#' |
|
| 14 |
#' The formula used is: |
|
| 15 |
#' SMD = (θ_RCT - θ_RWE) / sqrt(Var(θ_RCT) + Var(θ_RWE)) |
|
| 16 |
#' |
|
| 17 |
#' @param rct_estimate Numeric. Point estimate from the RCT (typically log hazard ratio or log odds ratio) |
|
| 18 |
#' @param rct_lower Numeric. Lower bound of the confidence interval from RCT |
|
| 19 |
#' @param rct_upper Numeric. Upper bound of the confidence interval from RCT |
|
| 20 |
#' @param rwe_estimate Numeric. Point estimate from the RWE study |
|
| 21 |
#' @param rwe_lower Numeric. Lower bound of the confidence interval from RWE |
|
| 22 |
#' @param rwe_upper Numeric. Upper bound of the confidence interval from RWE |
|
| 23 |
#' |
|
| 24 |
#' @return |
|
| 25 |
#' Returns a numeric value representing the standardized mean difference. |
|
| 26 |
#' Larger absolute values indicate greater disagreement between RCT and RWE results. |
|
| 27 |
#' |
|
| 28 |
#' @examples |
|
| 29 |
#' # Example with hazard ratios (log scale) |
|
| 30 |
#' smd <- smd_agreement( |
|
| 31 |
#' rct_estimate = log(0.87), |
|
| 32 |
#' rct_lower = log(0.78), |
|
| 33 |
#' rct_upper = log(0.97), |
|
| 34 |
#' rwe_estimate = log(0.82), |
|
| 35 |
#' rwe_lower = log(0.76), |
|
| 36 |
#' rwe_upper = log(0.87) |
|
| 37 |
#' ) |
|
| 38 |
#' |
|
| 39 |
#' @references |
|
| 40 |
#' Based on methodology for comparing RCT and RWE results using standardized differences.
|
| 41 |
#' |
|
| 42 |
#' @export |
|
| 43 |
smd_agreement <- function(rct_estimate,
                          rct_lower,
                          rct_upper,
                          rwe_estimate,
                          rwe_lower,
                          rwe_upper
                          ){

  # Input validation
  required_args <- c("rct_estimate", "rct_lower", "rct_upper",
                     "rwe_estimate", "rwe_lower", "rwe_upper")

  # Check for explicitly NULL values (vapply is type-stable, unlike sapply).
  # Note: arguments that are simply not supplied error here when the list is
  # built ("argument ... is missing"), before this check can report them.
  missing_args <- required_args[vapply(list(rct_estimate, rct_lower, rct_upper,
                                            rwe_estimate, rwe_lower, rwe_upper),
                                       is.null, logical(1))]
  if (length(missing_args) > 0) {
    stop(sprintf("Missing required arguments: %s",
                 paste(missing_args, collapse = ", ")))
  }

  # Check numeric inputs
  if (!all(vapply(list(rct_estimate, rct_lower, rct_upper,
                       rwe_estimate, rwe_lower, rwe_upper),
                  is.numeric, logical(1)))) {
    stop("All inputs must be numeric values")
  }

  # Check bounds: each CI must be a proper interval
  if (rct_lower >= rct_upper) {
    stop("RCT lower bound must be less than upper bound")
  }
  if (rwe_lower >= rwe_upper) {
    stop("RWE lower bound must be less than upper bound")
  }

  # Warn (not error) if an estimate falls outside its own CI
  if (rct_estimate < rct_lower || rct_estimate > rct_upper) {
    warning("RCT estimate is outside its confidence interval bounds")
  }
  if (rwe_estimate < rwe_lower || rwe_estimate > rwe_upper) {
    warning("RWE estimate is outside its confidence interval bounds")
  }

  # Numerator: difference between the two point estimates
  numerator <- (rct_estimate - rwe_estimate)

  # Standard errors recovered from the 95% CI width assuming normality:
  # SE = (upper - lower) / (2 * 1.96). (Previously misnamed "variances";
  # the variance is SE^2, which is what enters the denominator below.)
  se_rct <- (rct_upper - rct_lower)/(2*1.96)
  se_rwe <- (rwe_upper - rwe_lower)/(2*1.96)

  # Denominator: sqrt of the summed variances, i.e. sqrt(SE_rct^2 + SE_rwe^2)
  denominator <- sqrt(se_rct^2 + se_rwe^2)

  # Final check for valid calculation (e.g. infinite CI bounds)
  if (is.infinite(denominator) || is.nan(denominator)) {
    stop("Invalid calculation: denominator is infinite or NaN")
  }

  # SMD = (theta_RCT - theta_RWE) / sqrt(Var_RCT + Var_RWE)
  smd <- numerator/denominator

  return(smd)

}
| 1 |
#' Simulates an artificial EHR-derived analysis-ready oncology dataset
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Parameterized function to quickly create an |
|
| 5 |
#' EHR-derived analytic cohort for analytic code development. |
|
| 6 |
#' |
|
| 7 |
#' @details |
|
| 8 |
#' This function simulates a cohort of patients with |
|
| 9 |
#' oncology data. The cohort is simulated using a Weibull distribution for time to event |
|
| 10 |
#' and a logistic distribution for treatment assignment. |
|
| 11 |
#' The function also allows for missingness to be imposed on the data. |
|
| 12 |
#' The function is parameterized to allow for the number of patients |
|
| 13 |
#' to be simulated, the seed for reproducibility, and whether to |
|
| 14 |
#' include a patient id variable. |
|
| 15 |
#' |
|
| 16 |
#' @param n_total integer, number of total patients |
|
| 17 |
#' @param seed integer, seed for reproducibility |
|
| 18 |
#' @param include_id logical, include a generated patientid variable |
|
| 19 |
#' @param imposeNA logical, set covariates to missing |
|
| 20 |
#' @param propNA numeric, proportion of missingness, needs to be between 0 and 1 |
|
| 21 |
#' |
|
| 22 |
#' @return data frame with simulated analytic cohort |
|
| 23 |
#' |
|
| 24 |
#' @export |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' library(encore.analytics) |
|
| 29 |
#' |
|
| 30 |
#' data_miss <- simulate_data( |
|
| 31 |
#' n_total = 3500, |
|
| 32 |
#' seed = 41, |
|
| 33 |
#' include_id = FALSE, |
|
| 34 |
#' imposeNA = TRUE, |
|
| 35 |
#' propNA = .33 |
|
| 36 |
#' ) |
|
| 37 |
#' |
|
| 38 |
#' head(data_miss) |
|
| 39 |
#' |
|
| 40 |
#'} |
|
| 41 | ||
| 42 |
simulate_data <- function(n_total = 3500,
                          seed = 42,
                          include_id = TRUE,
                          imposeNA = TRUE,
                          propNA = NULL
                          ){

  # check input
  # is.numeric() accepts both integer and double, so simulate_data(3500L) works
  # (inherits(x, "numeric") rejected integer vectors despite the message).
  assertthat::assert_that(is.numeric(n_total), msg = "<n_total> needs to be an integer")
  assertthat::assert_that(n_total > 10, msg = "<n_total> needs to be larger than 10 for meaningful output")
  assertthat::assert_that(is.numeric(seed), msg = "<seed> needs to be an integer")
  assertthat::assert_that(inherits(include_id, "logical"), msg = "<include_id> needs to be a logical")
  assertthat::assert_that(inherits(imposeNA, "logical"), msg = "<imposeNA> needs to be a logical")
  if(!is.null(propNA)) assertthat::assert_that(inherits(propNA, "numeric"), msg = "<propNA> needs to be numeric")
  if(!is.null(propNA)) assertthat::assert_that(propNA >= 0 & propNA <= 1, msg = "<propNA> needs to be a numeric between 0 and 1")
  if(!is.null(propNA)) assertthat::assert_that(imposeNA == TRUE, msg = "<propNA> is specified but imposeNA is FALSE")
  # fail early with a clear message instead of crashing inside mice::ampute()
  if(imposeNA) assertthat::assert_that(!is.null(propNA), msg = "<imposeNA> is TRUE but <propNA> is not specified")

  # set the seed for reproducibility of all draws below
  set.seed(seed)

  # Build the total cohort (one row per patient; column order matters for
  # reproducibility because each column consumes RNG draws in sequence)
  cohort <- tibble::tibble(
    # patient id
    patientid = paste0("Patient", seq_len(n_total)),
    # age at index date
    dem_age_index_cont = stats::rnorm(n = n_total, mean = 69, sd = 8),
    # sex/male indicator
    dem_sex_cont = stats::rbinom(n = n_total, size = 1, prob = .33),
    # smoking history
    c_smoking_history = sample(c(TRUE, FALSE), size = n_total, replace = TRUE, prob = c(.44, .56)),
    # number of metastatic sites
    c_number_met_sites = sample(c(1, 2, 3, 4, 5), size = n_total, replace = TRUE, prob = c(.73, .22, .035, .006, .001)),
    # ECOG performance status indicator
    c_ecog_cont = stats::rbinom(n = n_total, size = 1, prob = .57),
    # stage at initial diagnosis
    c_stage_initial_dx_cont = sample(c(4, 3, 2, 1), size = n_total, replace = TRUE, prob = c(.76, 0.2, 0.028, 0.012)),
    # race
    dem_race = factor(sample(c("Asian", "White", "Other"), size = n_total, replace = TRUE, prob = c(.36, .62, .02))),
    # US region
    dem_region = factor(sample(c("Midwest", "Northeast", "South", "West"), size = n_total, replace = TRUE, prob = c(.14, .20, .39, .27))),
    # socio-economic status quintile
    dem_ses = sample(c(1, 2, 3, 4, 5), size = n_total, replace = TRUE, prob = c(.12, .15, .19, .24, .29)),
    # labs and vitals (several on log scale, e.g. bilirubin/monocytes)
    c_hemoglobin_g_dl_cont = stats::rnorm(n = n_total, mean = 12.90, sd = 1.3),
    c_urea_nitrogen_mg_dl_cont = stats::rnorm(n = n_total, mean = 2.71, sd = 0.3),
    c_platelets_10_9_l_cont = stats::rnorm(n = n_total, mean = 259, sd = 53),
    c_calcium_mg_dl_cont = stats::rnorm(n = n_total, mean = 2.23, sd = 0.04),
    c_glucose_mg_dl_cont = stats::rnorm(n = n_total, mean = 4.64, sd = 0.11),
    c_lymphocyte_leukocyte_ratio_cont = stats::rnorm(n = n_total, mean = 2.87, sd = 0.17),
    c_alp_u_l_cont = stats::rnorm(n = n_total, mean = 4.48, sd = 0.13),
    c_protein_g_l_cont = stats::rnorm(n = n_total, mean = 68, sd = 4),
    c_alt_u_l_cont = stats::rnorm(n = n_total, mean = 2.89, sd = 0.33),
    c_albumin_g_l_cont = stats::rnorm(n = n_total, mean = 40, sd = 3),
    c_bilirubin_mg_dl_cont = stats::rnorm(n = n_total, mean = -0.92, sd = 1.28),
    c_chloride_mmol_l_cont = stats::rnorm(n = n_total, mean = 102.0, sd = 3),
    c_monocytes_10_9_l_cont = stats::rnorm(n = n_total, mean = -0.51, sd = 0.25),
    c_eosinophils_leukocytes_ratio_cont = stats::rnorm(n = n_total, mean = 0.69, sd = 0.59),
    c_ldh_u_l_cont = stats::rnorm(n = n_total, mean = 1.69, sd = 0.05),
    c_hr_cont = stats::rnorm(n = n_total, mean = 4.43, sd = 0.04),
    c_sbp_cont = stats::rnorm(n = n_total, mean = 4.85, sd = 0.09),
    c_oxygen_cont = stats::rnorm(n = n_total, mean = 97.00, sd = 0.01),
    c_neutrophil_lymphocyte_ratio_cont = stats::rnorm(n = n_total, mean = 1.29, sd = 0.38),
    c_bmi_cont = stats::rnorm(n = n_total, mean = 3.23, sd = 0.13),
    c_ast_alt_ratio_cont = stats::rnorm(n = n_total, mean = 0.12, sd = 0.29),
    # time from diagnosis to index date
    c_time_dx_to_index = stats::rnorm(n = n_total, mean = 44, sd = 17),
    # de-novo metastatic disease at diagnosis
    c_de_novo_mets_dx = sample(c(TRUE, FALSE), size = n_total, replace = TRUE, prob = c(.79, .21)),
    c_height_cont = stats::rnorm(n = n_total, mean = 1.65, sd = 0.08),
    c_weight_cont = stats::rnorm(n = n_total, mean = 69, sd = 11),
    c_dbp_cont = stats::rnorm(n = n_total, mean = 76, sd = 6),
    # index year category
    c_year_index = factor(sample(c("<2018", "2018+"), size = n_total, replace = TRUE, prob = c(.95, .05)))
  )

  # assign treatment via a logistic model on the covariates
  cohort <- cohort |>
    dplyr::mutate(
      # treat (to simplify, categorical variables don't get betas);
      # coefficients are odds ratios entered on the log scale
      treat = stats::rbinom(
        n = n_total,
        size = 1,
        prob = locfit::expit(
          -5.8 +
            dem_age_index_cont * log(1.0071) +
            dem_sex_cont * log(0.9804) +
            c_smoking_history * log(0.7065) +
            c_number_met_sites * log(0.9982) +
            c_hemoglobin_g_dl_cont * log(1.0483) +
            c_urea_nitrogen_mg_dl_cont * log(0.9198) +
            c_platelets_10_9_l_cont * log(0.9998) +
            c_calcium_mg_dl_cont * log(0.9118) +
            c_glucose_mg_dl_cont * log(1.0954) +
            c_lymphocyte_leukocyte_ratio_cont * log(1.1192) +
            c_alp_u_l_cont * log(1.3126) +
            c_protein_g_l_cont * log(1.0002) +
            c_alt_u_l_cont * log(0.9679) +
            c_albumin_g_l_cont * log(1.0296) +
            c_bilirubin_mg_dl_cont * log(0.9126) +
            c_chloride_mmol_l_cont * log(1.0076) +
            c_monocytes_10_9_l_cont * log(1.3919) +
            c_eosinophils_leukocytes_ratio_cont * log(0.9567) +
            c_ldh_u_l_cont * log(0.8804) +
            c_hr_cont * log(1.2264) +
            c_sbp_cont * log(1.0647) +
            c_oxygen_cont * log(1.00064) +
            c_ecog_cont * log(0.7857) +
            c_neutrophil_lymphocyte_ratio_cont * log(1.0758) +
            c_bmi_cont * log(1.061) +
            c_ast_alt_ratio_cont * log(1.0756) +
            c_stage_initial_dx_cont * log(1.2177) +
            c_time_dx_to_index * log(1.0001)
        )
      )
    )

  # simulate heterogeneous treatment effects (HTE) by race:
  # Asian patients get a null treatment effect, all others a protective one

  ## Asian patients
  cohort_Asian <- cohort |>
    dplyr::filter(dem_race == "Asian")

  # betas (log hazard ratios) for the Weibull survival model
  betas_os_Asian <- c(
    treat = log(1),
    dem_sex_cont = log(0.79),
    dem_age_index_cont = log(1.02),
    c_smoking_history = log(.70),
    c_ecog_cont = log(.7)
  )

  set.seed(seed)
  cohort_outcome_Asian <- cohort_Asian |>
    dplyr::bind_cols(
      simsurv::simsurv(
        dist = "weibull",
        gammas = 1.2, # shape parameter
        lambdas = 0.01, # scale parameter
        betas = betas_os_Asian, # betas for the model
        x = cohort_Asian, # data frame with covariates
        maxt = 120 # administrative censoring at 120 months
      )
    ) |>
    dplyr::select(-id) |>
    dplyr::rename(
      fu_itt_months = eventtime,
      death_itt = status
    )

  ## Non-Asian patients (treatment hazard ratio 0.54)
  cohort_nonAsian <- cohort |>
    dplyr::filter(dem_race != "Asian")

  betas_os_nonAsian <- c(
    treat = log(0.54),
    dem_sex_cont = log(0.79),
    dem_age_index_cont = log(1.02),
    c_smoking_history = log(.70),
    c_ecog_cont = log(.7)
  )

  set.seed(seed)
  cohort_outcome_nonAsian <- cohort_nonAsian |>
    dplyr::bind_cols(
      simsurv::simsurv(
        dist = "weibull",
        gammas = 1.2,
        lambdas = 0.01,
        betas = betas_os_nonAsian,
        x = cohort_nonAsian,
        maxt = 120
      )
    ) |>
    dplyr::select(-id) |>
    dplyr::rename(
      fu_itt_months = eventtime,
      death_itt = status
    )

  # combine Asian and non-Asian strata back into one cohort
  cohort_outcome <- rbind(cohort_outcome_Asian, cohort_outcome_nonAsian)

  # optionally drop the generated patient id
  if(!include_id) cohort_outcome <- cohort_outcome |> dplyr::select(-patientid)

  if(imposeNA){

    # variables that must stay complete (ids, treatment, outcome, key demographics)
    varComplete <- c("patientid", "treat", "c_year_index", "fu_itt_months", "death_itt", "dem_age_index_cont", "dem_sex_cont")
    varNA <- setdiff(names(cohort_outcome), varComplete)

    # column positions eligible for missingness
    colsNA <- which(colnames(cohort_outcome) %in% varNA)

    # missingness pattern: 1 = observed, 0 = can be set missing
    default_pattern <- rep(1, ncol(cohort_outcome))
    pattern <- replace(default_pattern, colsNA, 0)

    # MCAR amputation with the requested proportion of incomplete rows
    set.seed(seed)
    cohort_outcome <- mice::ampute(
      data = cohort_outcome,
      prop = propNA,
      patterns = pattern,
      mech = "MCAR"
    )$amp

  }

  return(cohort_outcome)

}
|
| 280 |
| 1 |
#' Estimate raking weights for a mimids or wimids object |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Function estimates raking weights for multiple imputed and matched (\code{mimids})
|
|
| 5 |
#' or weighted (\code{wimids}) datasets. That is to match the distributions of
|
|
| 6 |
#' certain variables in the imputed and matched/weighted datasets to a distribution of |
|
| 7 |
#' a target population (e.g., clinical trial population). |
|
| 8 |
#' |
|
| 9 |
#' @details |
|
| 10 |
#' The function requires an object of class mimids or wimids (\code{x}), which is the output
|
|
| 11 |
#' of a workflow that requires imputing multiple (m) datasets using mice or amelia |
|
| 12 |
#' and matching or weighting each imputed dataset via the MatchThem package |
|
| 13 |
#' (see examples). |
|
| 14 |
#' |
|
| 15 |
#' The function additionally requires a list of target distributions (\code{targets}) for each variable
|
|
| 16 |
#' that is considered for the raking procedure. The list should contain named vectors with the target |
|
| 17 |
#' distributions for each variable and the names of the vectors should match the variable names in the imputed datasets. |
|
| 18 |
#' |
|
| 19 |
#' In brief, the raking procedure iteratively adjusts the weights to make the weighted sample percentages match the target |
|
| 20 |
#' population percentages for the selected variables. It does this by multiplying the current weight for each case by a
|
| 21 |
#' factor based on the ratio of the target population proportion to the weighted sample proportion for a given category. |
|
| 22 |
#' This adjustment is performed sequentially for each category of each selected variable. Because adjusting for one |
|
| 23 |
#' variable can disrupt the match for previous variables, the process is repeated through all selected variables in cycles. |
|
| 24 |
#' This iterative process minimizes the Kullback-Leibler (KL) divergence and continues until the weighted sample proportions |
|
| 25 |
#' match the target population proportions for all categories ("full convergence"), or until no further change occurs.
|
|
| 26 |
#' |
|
| 27 |
#' The function follows the following logic: |
|
| 28 |
#' 1. Extract the ith imputed dataset from the mimids or wimids object |
|
| 29 |
#' 2. Create a temporary case/patient ID |
|
| 30 |
#' 3. Apply the \code{\link[anesrake]{anesrake}} function to the ith imputed dataset
|
|
| 31 |
#' 4. Create a temporary dataframe with the case ID and replace the initial weights with the updated raking weights |
|
| 32 |
#' 5. Merge the temporary dataframe with the ith imputed dataset |
|
| 33 |
#' 6. Drop the temporary case ID |
|
| 34 |
#' 7. Return the ith imputed dataset with the raking weights |
|
| 35 |
#' |
|
| 36 |
#' The function returns a list of data frames with the updated raking weights. These |
|
| 37 |
#' updated raking weights overwrite in each data frame the existing \code{weights} column.
|
|
| 38 |
#' This column can then be used in a downstream analysis (e.g., Kaplan-Meier, Cox proportional hazards regression). |
|
| 39 |
#' |
|
| 40 |
#' @seealso |
|
| 41 |
#' \code{\link[anesrake]{anesrake}} \code{\link[MatchThem]{matchthem}} \code{\link[MatchThem]{weightthem}}
|
|
| 42 |
#' |
|
| 43 |
#' @param x imputed and matched (mimids) or weighted (wimids) object |
|
| 44 |
#' @param targets list of all target values for the raking procedure |
|
| 45 |
#' |
|
| 46 |
#' @return list of data frames with updated raking weights (\code{raking_weights})
|
|
| 47 |
#' |
|
| 48 |
#' @md |
|
| 49 |
#' |
|
| 50 |
#' @export |
|
| 51 |
#' |
|
| 52 |
#' @examples |
|
| 53 |
#' |
|
| 54 |
#' library(encore.analytics) |
|
| 55 |
#' library(mice) |
|
| 56 |
#' library(dplyr) |
|
| 57 |
#' library(MatchThem) |
|
| 58 |
#' library(survival) |
|
| 59 |
#' |
|
| 60 |
#' # simulate a cohort with 500 patients with 20% missing data
|
| 61 |
#' data <- simulate_data( |
|
| 62 |
#'   n_total = 500,
|
| 63 |
#' imposeNA = TRUE, |
|
| 64 |
#' propNA = 0.2 |
|
| 65 |
#' ) |> |
|
| 66 |
#' # anesrake works best with factor variables |
|
| 67 |
#' mutate(c_smoking_history = factor(ifelse(c_smoking_history == TRUE, "Current/former", "Never"))) |
|
| 68 |
#' |
|
| 69 |
#' # impute the data (create mids object) |
|
| 70 |
#' set.seed(42) |
|
| 71 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 72 |
#' |
|
| 73 |
#' # define covariates for propensity score model |
|
| 74 |
#' covariates <- data |> |
|
| 75 |
#' select(starts_with("c_"), starts_with("dem_")) |>
|
|
| 76 |
#' colnames() |
|
| 77 |
#' |
|
| 78 |
#' # define propensity score model |
|
| 79 |
#' fit <- as.formula(paste("treat ~", paste(covariates, collapse = " + ")))
|
|
| 80 |
#' |
|
| 81 |
#' # match patients within each imputed dataset |
|
| 82 |
#' mimids <- matchthem( |
|
| 83 |
#' formula = fit, |
|
| 84 |
#' datasets = mids, |
|
| 85 |
#' approach = 'within', |
|
| 86 |
#' method = 'nearest' |
|
| 87 |
#' ) |
|
| 88 |
#' |
|
| 89 |
#' smoker_target <- c(.35, .65) |
|
| 90 |
#' names(smoker_target) <- c("Current/former", "Never")
|
|
| 91 |
#' |
|
| 92 |
#' # summarize target distributions in a named list vector |
|
| 93 |
#' targets <- list(smoker_target) |
|
| 94 |
#' names(targets) <- c("c_smoking_history")
|
|
| 95 |
#' |
|
| 96 |
#' # estimate raking weights |
|
| 97 |
#' mirwds <- raking_weights( |
|
| 98 |
#' x = mimids, |
|
| 99 |
#' targets = targets |
|
| 100 |
#' ) |
|
| 101 |
#' |
|
| 102 |
raking_weights <- function(x,
                           targets = NULL
                           ){

  # input checks----

  # check if object is a mimids or wimids object
  assertthat::assert_that(inherits(x, c("mimids", "wimids")), msg = "<object> needs to be a mimids or wimids object")
  # check if targets is a list
  assertthat::assert_that(is.list(targets), msg = "<targets> needs to be a list")
  # check if targets is a named list
  # (names() can be NULL, in which case all(NULL != "") is vacuously TRUE,
  # so the NULL case must be rejected explicitly)
  assertthat::assert_that(!is.null(names(targets)) && all(names(targets) != ""), msg = "<targets> needs to be a named list")
  # check if targets is a list of numeric vectors (vapply is type-stable)
  assertthat::assert_that(all(vapply(targets, is.numeric, logical(1))), msg = "<targets> needs to be a list of named vectors")
  # check that every target vector is itself fully named
  assertthat::assert_that(all(vapply(targets, function(t) !is.null(names(t)) && all(names(t) != ""), logical(1))), msg = "<targets> needs to be a list of named vectors")

  # extract imputed and matched/weighted datasets and weights ----

  # worker: rake the i-th imputed dataset, keeping only matched/weighted cases
  estimate_raking_weights <- function(i){

    # all = FALSE : do NOT include observations with a zero estimated weight
    # include = FALSE : do NOT include original data with the missing values

    # Important: the all = parameter needs to be set to FALSE
    # since otherwise unmatched patients or those with zero weight
    # would be included, too!
    data <- MatchThem::complete(x, action = i, all = FALSE, include = FALSE)

    # create a temporary case ID (seq_len handles the 0-row edge case,
    # where seq(1, 0, 1) would yield c(1, 0))
    data <- data |>
      dplyr::mutate(caseid = seq_len(nrow(data)))

    # apply anesrake function to re-weight -----------------------------------
    anesrake_out <- anesrake::anesrake(
      # target distributions for raking procedure
      inputter = targets,
      # data to be weighted
      dataframe = data,
      # patient identifier
      caseid = data$caseid,
      # other weights that should be accounted for before re-weighting is conducted
      # for matching this is 0 (unmatched) and 1 (matched) and for weighting this is the actual weights
      weightvec = data$weights,
      # no verbosity
      verbose = FALSE
    )

    # temporary dataframe mapping case id -> updated raking weight
    caseweights <- data.frame(caseid = anesrake_out$caseid, weights = anesrake_out$weightvec)

    data_return <- data |>
      # replace "old" weights with the new raking weights
      dplyr::select(-weights) |>
      dplyr::left_join(caseweights, by = "caseid") |>
      # now we can drop the temporary id
      dplyr::select(-caseid)

    return(data_return)

  }

  # return list of imputed, matched/weighted and re-weighted datasets (mirwds);
  # one element per imputation in the mimids/wimids object
  mirwds <- lapply(
    X = seq_along(x$models),
    FUN = estimate_raking_weights
  )

  return(mirwds)

}
| 1 |
#' Manually fit and pool Cox proportional hazards model results from multiple imputed datasets |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function manually fits and pools results from Cox proportional hazards models using |
|
| 5 |
#' a list of imputed datasets. It leverages the `mice::as.mira` and `mice::pool` functions to |
|
| 6 |
#' ensure proper pooling of results across multiple imputations using Rubin's rules. |
|
| 7 |
#' |
|
| 8 |
#' @details |
|
| 9 |
#' The function requires a list of imputed data frames with weights and optional cluster (matching) information, |
|
| 10 |
#' as well as a formula for the Cox proportional hazards model. The data frames must include a column named |
|
| 11 |
#' \code{weights}, and optionally a column named \code{subclass} (for matched datasets to indicate cluster membership).
|
|
| 12 |
#' |
|
| 13 |
#' This function is particularly useful when working with imputed datasets that are not in the form of |
|
| 14 |
#' \code{mimids} or \code{wimids} objects, such as when intermediate steps like raking weights
|
|
| 15 |
#' (via \code{raking_weights}) are applied. It provides a flexible way to fit and pool Cox models
|
|
| 16 |
#' while ensuring compatibility with Rubin's rules for multiple imputation. |
|
| 17 |
#' |
|
| 18 |
#' The function follows these steps: |
|
| 19 |
#' 1. Fit a Cox proportional hazards model to each imputed dataset. If a \code{subclass} column is present,
|
|
| 20 |
#' it is used as a cluster variable for matched pairs. |
|
| 21 |
#' 2. Convert the list of fitted models into a \code{mira} object using \code{mice::as.mira}.
|
|
| 22 |
#' 3. Pool the results using \code{mice::pool}, which applies Rubin's rules for combining estimates
|
|
| 23 |
#' and variances across imputations. |
|
| 24 |
#' 4. Format the pooled results, including exponentiating the hazard ratios and calculating confidence intervals. |
|
| 25 |
#' |
|
| 26 |
#' @seealso |
|
| 27 |
#' \code{\link[survival]{coxph}}, \code{\link[mice]{pool}}, \code{\link[mice]{as.mira}},
|
|
| 28 |
#' \code{\link[MatchThem]{matchthem}}, \code{\link[MatchThem]{weightthem}}
|
|
| 29 |
#' |
|
| 30 |
#' @param x A list of imputed datasets with weights or raking weights (e.g., from \code{\link[encore.analytics]{raking_weights}})
|
|
| 31 |
#' and optional cluster information (for matched datasets). |
|
| 32 |
#' @param surv_formula A formula for the Cox proportional hazards model (default is \code{Surv(fu_itt_months, death_itt) ~ treat}).
|
|
| 33 |
#' |
|
| 34 |
#' @return A data frame containing the pooled results, including hazard ratios, confidence intervals, and p-values. |
|
| 35 |
#' |
|
| 36 |
#' @md |
|
| 37 |
#' |
|
| 38 |
#' @export |
|
| 39 |
#' |
|
| 40 |
#' @examples |
|
| 41 |
#' library(encore.analytics) |
|
| 42 |
#' library(mice) |
|
| 43 |
#' library(MatchThem) |
|
| 44 |
#' |
|
| 45 |
#' # Simulate a cohort with 500 patients and 20% missing data |
|
| 46 |
#' data <- simulate_data( |
|
| 47 |
#' n = 500, |
|
| 48 |
#' imposeNA = TRUE, |
|
| 49 |
#' propNA = 0.2 |
|
| 50 |
#' ) |
|
| 51 |
#' |
|
| 52 |
#' # Impute the data |
|
| 53 |
#' set.seed(42) |
|
| 54 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 55 |
#' |
|
| 56 |
#' # Fit a propensity score model |
|
| 57 |
#' fit <- as.formula(treat ~ dem_age_index_cont + dem_sex_cont + c_smoking_history) |
|
| 58 |
#' |
|
| 59 |
#' # Weight patients within each imputed dataset |
|
| 60 |
#' wimids <- weightthem( |
|
| 61 |
#' formula = fit, |
|
| 62 |
#' datasets = mids, |
|
| 63 |
#' approach = "within", |
|
| 64 |
#' method = "glm", |
|
| 65 |
#' estimand = "ATO" |
|
| 66 |
#' ) |
|
| 67 |
#' |
|
| 68 |
#' # Create a list of imputed and weighted datasets |
|
| 69 |
#' wimids_list <- MatchThem::complete(wimids, action = "all", all = FALSE, include = FALSE) |
|
| 70 |
#' |
|
| 71 |
#' # Define a survival model formula |
|
| 72 |
#' cox_fit <- as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 73 |
#' |
|
| 74 |
#' # Fit and pool Cox proportional hazards model results |
|
| 75 |
#' cox_pooling(wimids_list, surv_formula = cox_fit) |
|
| 76 |
cox_pooling <- function(x,
                        surv_formula = stats::as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat)
                        ){

  # input checks----

  # check if object is a list or mild object
  # (|| for scalar conditions; message names the actual parameter <x>)
  assertthat::assert_that(inherits(x, "list") || inherits(x, "mild"), msg = "<x> needs to be a list or mild object")
  # check if surv_formula is a formula
  assertthat::assert_that(inherits(surv_formula, "formula"), msg = "<surv_formula> needs to be a formula")
  # check that a weights column is present (taken from the first dataset;
  # all imputed datasets are assumed to share the same columns)
  df_colnames <- colnames(x[[1]])
  assertthat::assert_that("weights" %in% df_colnames, msg = "<weights> not in <x>")

  # Step 1: fit Cox model to each imputed dataset----
  cox_fit <- function(i){

    # if a subclass column is present (matched datasets), use it as the
    # cluster variable for robust variance estimation; weights/subclass are
    # resolved inside x[[i]] by coxph's non-standard evaluation
    if("subclass" %in% df_colnames){

      survival::coxph(
        data = x[[i]],
        formula = surv_formula,
        weights = weights,
        cluster = subclass,
        robust = TRUE
      )

    }else{

      survival::coxph(
        data = x[[i]],
        formula = surv_formula,
        weights = weights,
        robust = TRUE
      )

    }

  }

  # loop cox_fit function over each imputed dataset
  cox_results_list <- lapply(
    X = seq_along(x),
    FUN = cox_fit
  )

  # Step 2: extract results into a format suitable for pooling
  # convert each model into a "mira" object (required for pooling)
  mira_object <- mice::as.mira(cox_results_list)

  # Step 3: pool the results using Rubin's rules
  pooled_results <- mice::pool(mira_object)

  # Step 4: format the pooled results (exponentiate to hazard ratios, add CIs)
  cox_result <- broom::tidy(pooled_results, exponentiate = TRUE, conf.int = TRUE)

  return(cox_result)

}
| 1 |
#' Wrapper around \code{gtsummary::tbl_summary()} to create a beautiful Table 1 quickly
|
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Builds a table 1 using the \code{gtsummary} package. The function takes a one-row-per-patient dataframe
|
|
| 5 |
#' and creates a summary table of covariates stratified by treatment group. |
|
| 6 |
#' |
|
| 7 |
#' @details |
|
| 8 |
#' Wrapper for \code{\link[gtsummary]{tbl_summary}}.
|
|
| 9 |
#' |
|
| 10 |
#' The function \code{create_table1} is a wrapper around the \code{gtsummary::tbl_summary()} function.
|
|
| 11 |
#' It is designed to create a summary table of covariates stratified by treatment group. |
|
| 12 |
#' The function requires a one-row-per-patient dataframe and creates a summary table of covariates stratified by treatment group. |
|
| 13 |
#' |
|
| 14 |
#' @importFrom smd smd |
|
| 15 |
#' |
|
| 16 |
#' @param x dataframe with individual-level patient data in a one-row-per-patient format with treatment stratification variable and covariates to be displayed in the Table 1 |
|
| 17 |
#' @param covariates character vector of columns/covariate names to be displayed in Table 1 |
|
| 18 |
#' @param covariates_labels named character vector or list of formulas specifying variables labels of covariate-label pairs to display in table |
|
| 19 |
#' @param treat character specifying column name of treatment variable |
|
| 20 |
#' @param explicit_na_categorical logical, should missings in categorical variables be explicitly included as a separate category (default is TRUE) |
|
| 21 |
#' |
|
| 22 |
#' @return object of class "tbl_summary" "gtsummary" |
|
| 23 |
#' |
|
| 24 |
#' @export |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' library(encore.analytics) |
|
| 29 |
#' |
|
| 30 |
#' # simulate a cohort with 1,000 patients with 20% missing data |
|
| 31 |
#' data <- simulate_data( |
|
| 32 |
#' n = 1000, |
|
| 33 |
#' imposeNA = TRUE, |
|
| 34 |
#' propNA = 0.2 |
|
| 35 |
#' ) |
|
| 36 |
#' |
|
| 37 |
#' # create a Table 1 |
|
| 38 |
#' create_table1( |
|
| 39 |
#' x = data, |
|
| 40 |
#' covariates = c("dem_age_index_cont", "dem_sex_cont", "c_smoking_history"),
|
|
| 41 |
#' covariates_labels = list( |
|
| 42 |
#' "dem_age_index_cont" = "Age", |
|
| 43 |
#' "dem_sex_cont" = "Sex", |
|
| 44 |
#' "c_smoking_history" = "Smoking history" |
|
| 45 |
#' ), |
|
| 46 |
#' treat = "treat" |
|
| 47 |
#' ) |
|
| 48 |
#'} |
|
| 49 |
create_table1 <- function(x = NULL,
                          covariates = NULL,
                          covariates_labels = NULL,
                          treat = "treat",
                          explicit_na_categorical = TRUE
){

  # optional dependencies required by gtsummary::add_difference(test = "smd")
  install_on_demand("cardx")
  install_on_demand("smd")

  # check input
  # inherits() is the reliable test: tibbles carry classes "tbl_df"/"tbl"/
  # "data.frame" but never "tibble", so matching class(x) against "tibble"
  # was dead code
  assertthat::assert_that(inherits(x, "data.frame"), msg = "<x> is not a data.frame or tibble")
  # check non-emptiness first: all(NULL %in% ...) is vacuously TRUE, so an
  # empty <covariates> would otherwise slip past the membership check below
  assertthat::assert_that(length(covariates) > 0, msg = "<covariates> must be a non-empty character vector")
  assertthat::assert_that(all(covariates %in% colnames(x)), msg = "some or all of <covariates> not in <x>")
  assertthat::assert_that(is.list(covariates_labels) || is.null(covariates_labels), msg = "<covariates_labels> needs to be a named character vector or list of formulas specifying variables labels")
  assertthat::assert_that(is.null(covariates_labels) || length(covariates_labels) == length(covariates), msg = "<covariates_labels> must be a named character vector or list of formulas specifying variables labels")
  assertthat::assert_that(length(treat) == 1, msg = "<treat> must be a single character string")
  assertthat::assert_that(treat %in% colnames(x), msg = "<treat> not in <x>")
  assertthat::assert_that(is.logical(explicit_na_categorical), msg = "<explicit_na_categorical> must be a logical TRUE or FALSE")

  # assign treatment (stratification) variable
  # and subset to the columns shown in the table
  x[["strata"]] <- x[[treat]]
  x <- x[c(covariates, "strata")]

  if(explicit_na_categorical){

    # encode NA explicitly as "Missing" (only applicable to categorical)
    x <- x |>
      # logicals into categorical (factor() coerces TRUE/FALSE to the
      # matching character levels)
      dplyr::mutate(
        dplyr::across(
          dplyr::where(function(.x) is.logical(.x)),
          .fns = function(.x) factor(.x, levels = c("TRUE", "FALSE", "Missing"))
        )
      ) |>
      # for all categorical, assign a "Missing" category
      dplyr::mutate(
        dplyr::across(
          dplyr::where(function(.x) is.character(.x) | is.factor(.x)),
          .fns = function(.x) forcats::fct_na_value_to_level(.x, level = "Missing")
        )
      ) |>
      # drop "Missing" level for covariates which are fully observed/have no NA
      droplevels()
  }

  # build the stratified summary table with SMDs and an overall column
  table1 <- x |>
    gtsummary::tbl_summary(
      by = "strata",
      missing = "ifany",
      missing_text = "Missing",
      label = covariates_labels
    ) |>
    gtsummary::add_difference(test = dplyr::everything() ~ "smd") |>
    gtsummary::modify_column_hide(conf.low) |>
    gtsummary::add_overall() |>
    gtsummary::modify_header(
      label ~ "**Patient characteristic**",
      stat_0 ~ "**Total** <br> N = {N}",
      stat_1 ~ "**{level}** <br> N = {n} <br> ({style_percent(p, digits=1)}%)",
      stat_2 ~ "**{level}** <br> N = {n} <br> ({style_percent(p, digits=1)}%)"#,
      #test_result ~ "**SMD**"
    ) |>
    gtsummary::modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment received**")

  return(table1)

}
| 1 |
#' Calculate Agreement Metrics Between RCT and RWE Results |
|
| 2 |
#' |
|
| 3 |
#' @description Creates a formatted table comparing results from randomized controlled trials (RCTs) |
|
| 4 |
#' with real-world evidence (RWE) emulation studies by calculating agreement metrics. |
|
| 5 |
#' |
|
| 6 |
#' @details |
|
| 7 |
#' Calculates three types of agreement metrics: |
|
| 8 |
#' - Statistical significance agreement: Same direction and statistical significance |
|
| 9 |
#' - Estimate agreement: RWE estimate within RCT confidence interval |
|
| 10 |
#' - SMD agreement: Standardized mean difference below threshold (default 1.96) |
|
| 11 |
#' |
|
| 12 |
#' All estimates should be hazard ratios (HR) and must be positive. |
|
| 13 |
#' Estimates are log-transformed for SMD calculation. |
|
| 14 |
#' |
|
| 15 |
#' @importFrom tidyr pivot_wider pivot_longer |
|
| 16 |
#' @importFrom smd smd |
|
| 17 |
#' |
|
| 18 |
#' @param x A data.frame or tibble containing RCT and RWE results (in wide format, see examples). |
|
| 19 |
#' Must contain these columns: |
|
| 20 |
#' |
|
| 21 |
#' - rct_estimate = Point estimate from the RCT |
|
| 22 |
#' - rct_lower = Lower confidence limit of the RCT result |
|
| 23 |
#' - rct_upper = Upper confidence limit of the RCT result |
|
| 24 |
#' - rwe_estimate = Point estimate from the RWE emulation study |
|
| 25 |
#' - rwe_lower = Lower confidence limit of the RWE result |
|
| 26 |
#' - rwe_upper = Upper confidence limit of the RWE result |
|
| 27 |
#' |
|
| 28 |
#' @param analysis_col Character. Name of column identifying different analyses |
|
| 29 |
#' @param group_col Character. Optional. Name of column for grouping results (e.g., database) |
|
| 30 |
#' @param estimate_label Character. Label for estimates in table header. Default: "HR (95% CI)" |
|
| 31 |
#' @param smd_threshold Numeric. Threshold for SMD agreement. Default: 1.96 (α=0.05) |
|
| 32 |
#' |
|
| 33 |
#' @return A gt table object with formatted agreement metrics |
|
| 34 |
#' |
|
| 35 |
#' @md |
|
| 36 |
#' |
|
| 37 |
#' @examples |
|
| 38 |
#' # Example 1: One analysis per database with grouping |
|
| 39 |
#' |
|
| 40 |
#'x <- tibble::tribble( |
|
| 41 |
#'~Analysis, ~Database, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 42 |
#'"Main analysis", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 43 |
#'"Main analysis", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 44 |
#'"Main analysis", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 45 |
#') |
|
| 46 |
#' |
|
| 47 |
#' agreement_metrics(x, analysis_col = "Analysis", group_col = "Database") |
|
| 48 |
#' |
|
| 49 |
#' # Example 2: Three analyses, one database without grouping |
|
| 50 |
#' |
|
| 51 |
#' x <- tibble::tribble( |
|
| 52 |
#' ~Analysis, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 53 |
#' "Main analysis", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 54 |
#' "Sensitivity analysis 1", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 55 |
#' "Sensitivity analysis 2", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 56 |
#' ) |
|
| 57 |
#' |
|
| 58 |
#' agreement_metrics(x, analysis_col = "Analysis") |
|
| 59 |
#' |
|
| 60 |
#' # Example 3: Three analyses per database with grouping |
|
| 61 |
#' x <- tibble::tribble( |
|
| 62 |
#' ~Analysis, ~Database, ~rct_estimate, ~rct_lower, ~rct_upper, ~rwe_estimate, ~rwe_lower, ~rwe_upper, |
|
| 63 |
#' "Main analysis", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 64 |
#' "Main analysis", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 65 |
#' "Main analysis", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6, |
|
| 66 |
#' "Sensitivity analysis 1", "Database 1", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 67 |
#' "Sensitivity analysis 1", "Database 2", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6, |
|
| 68 |
#' "Sensitivity analysis 1", "Database 3", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 69 |
#' "Sensitivity analysis 2", "Database 1", 0.87, 0.78, 0.97, 0.82, 0.76, 0.87, |
|
| 70 |
#' "Sensitivity analysis 2", "Database 2", 0.5, 0.4, 0.6, 2.0, 1.8, 2.2, |
|
| 71 |
#' "Sensitivity analysis 2", "Database 3", 0.8, 0.7, 0.9, 1.5, 1.4, 1.6 |
|
| 72 |
#' ) |
|
| 73 |
#' |
|
| 74 |
#' agreement_metrics(x, analysis_col = "Analysis", group_col = "Database") |
|
| 75 |
#' |
|
| 76 |
#' @export |
|
| 77 |
agreement_metrics <- function(x,
                              analysis_col,
                              group_col = NULL, # subgroup, e.g., database
                              estimate_label = "HR (95% CI)",
                              smd_threshold = 1.96 # customizable threshold for SMD agreement
){

  # input checks
  assertthat::assert_that(inherits(x, "data.frame"), msg = "<x> is not a data.frame or tibble")
  assertthat::assert_that(all(c("rct_estimate", "rct_lower", "rct_upper", "rwe_estimate", "rwe_lower", "rwe_upper") %in% colnames(x)), msg = "<x> does not contain all required columns")
  assertthat::assert_that(!is.null(analysis_col), msg = "<analysis_col> cannot be NULL")
  assertthat::assert_that(analysis_col %in% colnames(x), msg = sprintf("<analysis_col> '%s' not found in <x>", analysis_col))
  if(!is.null(group_col)) assertthat::assert_that(group_col %in% colnames(x), msg = "<group_col> not in <x>")

  # hazard ratios must be strictly positive (they are log-transformed below)
  assertthat::assert_that(all(x$rct_estimate > 0), msg = "RCT estimates must be positive for hazard ratios")
  assertthat::assert_that(all(x$rwe_estimate > 0), msg = "RWE estimates must be positive for hazard ratios")

  # if no smd_value present in the results table, calculate it row by row;
  # a failing row yields NA_real_ with a warning instead of aborting
  if(!"smd_value" %in% colnames(x)){
    x <- x |>
      dplyr::rowwise() |>
      dplyr::mutate(
        smd_value = tryCatch({
          smd_agreement(
            rct_estimate = log(rct_estimate),
            rct_lower = log(rct_lower),
            rct_upper = log(rct_upper),
            rwe_estimate = log(rwe_estimate),
            rwe_lower = log(rwe_lower),
            rwe_upper = log(rwe_upper)
          )
        }, error = function(e) {
          warning(sprintf("SMD calculation failed: %s", e$message))
          NA_real_
        })
      ) |>
      dplyr::ungroup()
  }

  # calculate agreement metrics
  x <- x |>
    dplyr::mutate(
      # significance agreement: same direction AND same significance status
      significance_agreement = dplyr::case_when(
        # superiority in both
        rct_estimate < 1 & rct_upper < 1 & rwe_estimate < 1 & rwe_upper < 1 ~ "Yes",
        # non-significant protective effect in both
        rct_estimate < 1 & rct_upper >= 1 & rwe_estimate < 1 & rwe_upper >= 1 ~ "Yes",
        # significant harmful effect in both
        rct_estimate >= 1 & rct_lower >= 1 & rwe_estimate >= 1 & rwe_upper >= 1 ~ "Yes",
        # all other cases
        TRUE ~ "No"
      ),

      # estimate agreement: RWE point estimate falls inside the RCT CI
      estimate_agreement = dplyr::if_else(
        rwe_estimate >= rct_lower & rwe_estimate <= rct_upper, "Yes", "No"
      ),

      # SMD agreement against the user-supplied threshold
      smd_agreement = dplyr::case_when(
        is.na(smd_value) ~ "NA",
        abs(smd_value) < smd_threshold ~ "Yes",
        TRUE ~ "No"
      )
    )

  # format table for display
  x_format <- x |>
    dplyr::mutate(dplyr::across(dplyr::where(is.numeric), ~ format(.x, digits = 2, nsmall = 2))) |>
    dplyr::mutate(
      RCT = glue::glue("{rct_estimate} ({rct_lower} - {rct_upper})"),
      RWE = glue::glue("{rwe_estimate} ({rwe_lower} - {rwe_upper})"),
      smd_agreement = glue::glue("{smd_agreement} ({smd_value})")
    ) |>
    dplyr::select(dplyr::all_of(c(analysis_col, group_col)), "RCT", "RWE", "significance_agreement", "estimate_agreement", "smd_agreement")

  # conditional grouping (e.g. one row group per database)
  if (!is.null(group_col)) {
    x_format <- x_format |>
      dplyr::group_by(dplyr::across(dplyr::all_of(group_col)))
  }

  # create and format gt table
  x_gt <- x_format |>
    gt::gt() |>
    gt::tab_spanner(
      label = estimate_label,
      columns = c("RCT", "RWE")
    ) |>
    gt::cols_label(
      significance_agreement = gt::md("Statistical <br> significance <br> agreement"),
      estimate_agreement = gt::md("Estimate <br> agreement"),
      smd_agreement = "SMD"
    )

  # apply styles for agreement columns: green for "Yes", red for "No";
  # the smd_agreement cells embed the value ("Yes (0.12)"), hence str_detect
  for (col in c("significance_agreement", "estimate_agreement", "smd_agreement")) {
    x_gt <- x_gt |>
      gt::tab_style(
        style = gt::cell_text(color = "darkgreen"),
        locations = gt::cells_body(
          columns = !!col,
          rows = if (col == "smd_agreement") {
            stringr::str_detect(x_format[[col]], "Yes")
          } else {
            x_format[[col]] == "Yes"
          }
        )
      ) |>
      gt::tab_style(
        style = gt::cell_text(color = "darkred"),
        locations = gt::cells_body(
          columns = !!col,
          rows = if (col == "smd_agreement") {
            stringr::str_detect(x_format[[col]], "No")
          } else {
            x_format[[col]] == "No"
          }
        )
      )
  }

  # footnote abbreviations
  # BUG FIX: footnote_label was previously only assigned for the default
  # estimate_label, so any other label crashed the glue() call below with
  # "object 'footnote_label' not found". Build the abbreviation list
  # conditionally instead; the default label yields the identical footnote.
  abbreviations <- c(
    "CI = Confidence interval",
    if (estimate_label == "HR (95% CI)") "HR = Hazard ratio",
    "RCT = Randomized controlled trial",
    "RWE = Real-world evidence",
    "SMD = standardized mean difference (based on log hazard ratios)"
  )

  # Add bold styling for headers and attach the footnote
  x_gt <- x_gt |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_column_labels()
    ) |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_column_spanners()
    ) |>
    gt::tab_style(
      style = gt::cell_text(weight = "bold"),
      locations = gt::cells_row_groups()
    ) |>
    gt::tab_footnote(
      paste0("Abbreviations: ", paste(abbreviations, collapse = ", "))
    )

  return(x_gt)
}
| 1 |
#' Pooled Kaplan-Meier estimate and survival curve |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' Computes pooled median survival Kaplan-Meier estimates using Rubin's rule |
|
| 5 |
#' and outputs corresponding Kaplan-Meier curve across imputed and |
|
| 6 |
#' matched/weighted datasets |
|
| 7 |
#' |
|
| 8 |
#' @details |
|
| 9 |
#' The function requires an object of class mimids or wimids (\code{x}), which is the output
|
|
| 10 |
#' of a workflow that requires imputing multiple (m) datasets using mice or amelia |
|
| 11 |
#' and matching or weighting each imputed dataset via the MatchThem package |
|
| 12 |
#' (see examples). |
|
| 13 |
#' |
|
| 14 |
#' The function fits the pre-specified survfit model (\code{surv_formula}, \code{\link[survival]{survfit}} package)
|
|
| 15 |
#' to compute survival probabilities at each individual time point according to the Kaplan-Meier method. |
|
| 16 |
#' For matched and weighted datasets, weights, cluster membership (matching only) and robust |
|
| 17 |
#' variance estimates are considered in the \code{\link[survival]{survfit}} call by default.
|
|
| 18 |
#' |
|
| 19 |
#' Since survival probabilities typically don't follow normal distributions, |
|
| 20 |
#' these need to be transformed to approximate normality first before pooling |
|
| 21 |
#' across imputed datasets and time points. To that end, survival probabilities are first |
|
| 22 |
#' transformed using a complementary log-log transformation (\code{log(-log(1-pr(surv)))})
|
|
| 23 |
#' as recommended by multiple sources (Marshall, Billingham, and Bryan (2009)). |
|
| 24 |
#' |
|
| 25 |
#' To pool the transformed estimates across imputed datasets and time points, the pool.scalar |
|
| 26 |
#' function is used to apply Rubin's rule to combine pooled estimates (qbar) according to |
|
| 27 |
#' formula (3.1.2) Rubin (1987) and to compute the corresponding total variance (t) of the pooled |
|
| 28 |
#' estimate according to formula (3.1.5) Rubin (1987). |
|
| 29 |
#' |
|
| 30 |
#' The pooled survival probabilities are then back-transformed via \code{1-exp(-exp(qbar))}
|
|
| 31 |
#' for pooled survival probability estimates and \code{1-exp(-exp(qbar +/- 1.96*sqrt(t)))}
|
|
| 32 |
#' for lower and upper 95% confidence intervals. As the formula indicates, the pooled standard |
|
| 33 |
#' error is computed as the square root of the total variance. The vertically stacked table |
|
| 34 |
#' with transformed and backtransformed estimates is returned with the |
|
| 35 |
#' \code{km_survival_table} table.
|
|
| 36 |
#' |
|
| 37 |
#' Finally, the median survival time is extracted from the \code{km_survival_table} table
|
|
| 38 |
#' by determining the time the survival probability drops below .5 for the first time. |
|
| 39 |
#' For this a sub-function of Terry M. Therneau's \code{\link[survival]{print.survfit}} function
|
|
| 40 |
#' is used. Therneau also considers some edge cases/nuisances (x = time, y = surv): |
|
| 41 |
#' - Nuisance 1: if one of the y's is exactly .5, we want the mean of the corresponding x and the first x for which y<.5. We need to use the equivalent of all.equal to check for a .5 however: survfit(Surv(1:100)~1) gives a value of .5 + 1.1e-16 due to roundoff error. |
|
| 42 |
#' - Nuisance 2: there may be an NA in the y's
|
| 43 |
#' - Nuisance 3: if no y's are <=.5, then we should return NA |
|
| 44 |
#' - Nuisance 4: the obs (or many) after the .5 may be censored, giving a stretch of values = .5 +- epsilon |
|
| 45 |
#' |
|
| 46 |
#' The function follows the following logic: |
|
| 47 |
#' 1. Fit Kaplan-Meier survival function to each imputed and matched/weighted dataset |
|
| 48 |
#' 2. Transform survival probabilities using complementary log-log transformation |
|
| 49 |
#' 3. Pool transformed survival probabilities and compute total variance using Rubin's rule |
|
| 50 |
#' 4. Back-transform pooled survival probabilities and compute 95% confidence intervals |
|
| 51 |
#' 5. Extract median survival time and corresponding 95% confidence intervals |
|
| 52 |
#' 6. Plot Kaplan-Meier curve with pooled survival probabilities and confidence intervals |
|
| 53 |
#' |
|
| 54 |
#' More references: |
|
| 55 |
#' - https://stefvanbuuren.name/fimd/sec-pooling.html |
|
| 56 |
#' - https://link.springer.com/article/10.1007/s10198-008-0129-y |
|
| 57 |
#' - https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-015-0048-4 |
|
| 58 |
#' |
|
| 59 |
#' @seealso |
|
| 60 |
#' \code{\link[survival]{survfit}} \code{\link[mice]{pool.scalar}} \code{\link[MatchThem]{matchthem}} \code{\link[MatchThem]{weightthem}}
|
|
| 61 |
#' |
|
| 62 |
#' @param x imputed and matched (mimids) or weighted (wimids) object |
|
| 63 |
#' @param surv_formula specification of survival model formula to be fitted |
|
| 64 |
#' |
|
| 65 |
#' @return list with pooled median survival estimate and pooled Kaplan-Meier curve |
|
| 66 |
#' \code{km_median_survival}:
|
|
| 67 |
#' - strata = stratum |
|
| 68 |
#' - t_median = median survival time |
|
| 69 |
#' - t_lower = lower 95% CI of median survival time |
|
| 70 |
#' - t_upper = upper 95% CI of median survival time |
|
| 71 |
#' |
|
| 72 |
#' \code{km_survival_table}:
|
|
| 73 |
#' - strata = stratum |
|
| 74 |
#' - time = observed time point |
|
| 75 |
#' - m = number of imputed datasets |
|
| 76 |
#' - qbar = pooled univariate estimate of the complementary log-log transformed survival probabilities, see formula (3.1.2) Rubin (1987) |
|
| 77 |
#' - t = total variance of the pooled univariate estimate of the complementary log-log transformed survival probabilities, formula (3.1.5) Rubin (1987) |
|
| 78 |
#' - se = total standard error of the pooled estimate (derived as sqrt(t)) |
|
| 79 |
#' - surv = back-transformed pooled survival probability |
|
| 80 |
#' - lower = Wald-type lower 95% confidence interval of back-transformed pooled survival probability |
|
| 81 |
#' - upper = Wald-type upper 95% confidence interval of back-transformed pooled survival probability |
|
| 82 |
#' |
|
| 83 |
#' \code{km_plot}: ggplot2 object with Kaplan-Meier curve
|
|
| 84 |
#' |
|
| 85 |
#' @md |
|
| 86 |
#' |
|
| 87 |
#' @export |
|
| 88 |
#' |
|
| 89 |
#' @examples |
|
| 90 |
#' library(encore.analytics) |
|
| 91 |
#' library(mice) |
|
| 92 |
#' library(MatchThem) |
|
| 93 |
#' |
|
| 94 |
#' # simulate a cohort with 1,000 patients with 20% missing data |
|
| 95 |
#' data <- simulate_data( |
|
| 96 |
#' n = 500, |
|
| 97 |
#' imposeNA = TRUE, |
|
| 98 |
#' propNA = 0.2 |
|
| 99 |
#' ) |
|
| 100 |
#' |
|
| 101 |
#' # impute the data |
|
| 102 |
#' set.seed(42) |
|
| 103 |
#' mids <- mice(data, m = 5, print = FALSE) |
|
| 104 |
#' |
|
| 105 |
#' # fit a propensity score model |
|
| 106 |
#' fit <- as.formula(treat ~ dem_age_index_cont + dem_sex_cont + c_smoking_history) |
|
| 107 |
#' |
|
| 108 |
#' # weight (or alternatively match) patients within each imputed dataset |
|
| 109 |
#' wimids <- weightthem( |
|
| 110 |
#' formula = fit, |
|
| 111 |
#' datasets = mids, |
|
| 112 |
#' approach = "within", |
|
| 113 |
#' method = "glm", |
|
| 114 |
#' estimand = "ATO" |
|
| 115 |
#' ) |
|
| 116 |
#' |
|
| 117 |
#' # fit a survival model |
|
| 118 |
#' km_fit <- as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 119 |
#' |
|
| 120 |
#' # estimate and pool median survival times and Kaplan-Meier curve |
|
| 121 |
#' km_out <- km_pooling( |
|
| 122 |
#' x = wimids, |
|
| 123 |
#' surv_formula = km_fit |
|
| 124 |
#' ) |
|
| 125 |
#' |
|
| 126 |
#' # median survival time |
|
| 127 |
#' km_out$km_median_survival |
|
| 128 |
#' |
|
| 129 |
#' # KM curve |
|
| 130 |
#' km_out$km_plot |
|
| 131 |
#' |
|
| 132 |
km_pooling <- function(x = NULL, |
|
| 133 |
surv_formula = stats::as.formula(survival::Surv(fu_itt_months, death_itt) ~ treat) |
|
| 134 |
){
|
|
| 135 | ||
| 136 |
# input checks |
|
| 137 |
# check if x is a mimids or wimids object |
|
| 138 | 3x |
assertthat::assert_that(inherits(x, c("mimids", "wimids")), msg = "<x> needs to be a mimids or wimids object")
|
| 139 |
# check if surv_formula is a formula |
|
| 140 | 2x |
assertthat::assert_that(inherits(surv_formula, "formula"), msg = "<surv_formula> needs to be a formula") |
| 141 |
# check weights and subclass |
|
| 142 | 1x |
assertthat::assert_that("weights" %in% names(MatchThem::complete(x)), msg = "<x> needs to contain a weights column")
|
| 143 |
# check if weights and subclass are in the data |
|
| 144 | 1x |
if(inherits(x, "mimids")){
|
| 145 | ! |
assertthat::assert_that("subclass" %in% names(MatchThem::complete(x)), msg = "<x> needs to contain a weights column")
|
| 146 |
} |
|
| 147 | ||
| 148 |
# Fit Kaplan Meier -------------------------------------------------------- |
|
| 149 | ||
| 150 |
# first, we fit a survival function for ith dataset |
|
| 151 | 1x |
compute_km <- function(i){
|
| 152 | ||
| 153 |
# compute the survival function for dataset i |
|
| 154 | ||
| 155 |
## for weighting |
|
| 156 | 5x |
if(inherits(x, "wimids")){
|
| 157 | ||
| 158 | 5x |
survfit_fit <- survival::survfit( |
| 159 | 5x |
formula = surv_formula, |
| 160 | 5x |
weights = weights, |
| 161 | 5x |
robust = T, |
| 162 | 5x |
data = i |
| 163 |
) |
|
| 164 | ||
| 165 |
} |
|
| 166 | ||
| 167 |
## for matching |
|
| 168 | 5x |
if(inherits(x, "mimids")){
|
| 169 | ||
| 170 | ! |
survfit_fit <- survival::survfit( |
| 171 | ! |
formula = surv_formula, |
| 172 | ! |
weights = weights, |
| 173 | ! |
cluster = subclass, |
| 174 | ! |
robust = T, |
| 175 | ! |
data = i |
| 176 |
) |
|
| 177 | ||
| 178 |
} |
|
| 179 | ||
| 180 | 5x |
return(survfit_fit) |
| 181 | ||
| 182 |
} |
|
| 183 | ||
| 184 | ||
| 185 |
# compute KM across all imputed (and weighted/matched) datasets -------------------- |
|
| 186 | ||
| 187 |
# next, we create a list of datasets to apply the compute_km function to |
|
| 188 |
# action = "all" : produces a mild object (list) of the multiply imputed datasets |
|
| 189 |
# all = FALSE : do NOT include observations with a zero estimated weight |
|
| 190 |
# include = FALSE : do NOT include original data with the missing values |
|
| 191 | ||
| 192 |
# Important: the all = parameter needs to be set to FALSE |
|
| 193 |
# since otherwise unmatched patients or those zero weight |
|
| 194 |
# will be included, too! |
|
| 195 | 1x |
object_list <- MatchThem::complete(x, action = "all", all = FALSE, include = FALSE) |
| 196 | ||
| 197 |
# now apply the compute_km function across all imputed and matched/weighted datasets |
|
| 198 |
# this creates a list with computed survival probabilities |
|
| 199 | 1x |
survival_fit_list <- lapply(object_list, FUN = compute_km) |
| 200 | ||
| 201 | ||
| 202 |
# pool survival probabilities --------------------------------------------- |
|
| 203 | ||
| 204 |
# now to combine/pool quantities with non-normal distributions |
|
| 205 |
# such as survival probabilities, Stef van Buuren |
|
| 206 |
# recommends to tansform survival probabilities |
|
| 207 |
# using a complementary log-log transformation |
|
| 208 |
# https://stefvanbuuren.name/fimd/sec-pooling.html |
|
| 209 |
# reference: Marshall, Billingham, and Bryan (2009) https://link.springer.com/article/10.1007/s10198-008-0129-y |
|
| 210 |
# reference: https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-015-0048-4 |
|
| 211 | ||
| 212 |
# that is, the function cloglog performs the |
|
| 213 |
# transformations on all survival probabilities |
|
| 214 |
# stored in survival_fit_list |
|
| 215 | 1x |
cloglog <- function(i){
|
| 216 | ||
| 217 |
# convert to data.frame |
|
| 218 | 5x |
survival_times_df <- with( |
| 219 | 5x |
summary(i), |
| 220 | 5x |
data.frame(strata, time, surv, std.err, lower, upper) |
| 221 |
) |> |
|
| 222 | 5x |
dplyr::mutate( |
| 223 |
# compute transformed qbar |
|
| 224 | 5x |
q = log(-log(1-surv)), |
| 225 |
# compute transformed U |
|
| 226 | 5x |
u = std.err^2 / ((1-surv) * log(1-surv))^2 |
| 227 |
) |
|
| 228 | ||
| 229 |
} |
|
| 230 | ||
| 231 |
# we apply this function to all fitted survival curves |
|
| 232 | 1x |
km_survival <- lapply(survival_fit_list, FUN = cloglog) |> |
| 233 | ||
| 234 |
# the resulting list with q and u per stratum and dataset |
|
| 235 |
# is then vertically stacked to a long data frame (using bind_rows). |
|
| 236 | 1x |
dplyr::bind_rows() |> |
| 237 | ||
| 238 |
# group by the exposure levels (stratum) and apply the pool.scalar function |
|
| 239 |
# which pools univariate estimates of m datasets. This returns: |
|
| 240 |
# m = The number of imputations |
|
| 241 |
# qbar = The pooled univariate estimate, formula (3.1.2) Rubin (1987) |
|
| 242 |
# t = The total variance of the pooled estimated, formula (3.1.5) Rubin (1987). |
|
| 243 | 1x |
dplyr::group_by(strata, time) |> |
| 244 | 1x |
dplyr::summarize( |
| 245 | 1x |
data.frame(mice::pool.scalar(Q = q, U = u, k = 1)[c("m", "qbar", "t")]),
|
| 246 | 1x |
.groups = "drop" |
| 247 |
) |> |
|
| 248 | ||
| 249 |
# we back-transform back and compute CI's |
|
| 250 | 1x |
dplyr::mutate( |
| 251 | 1x |
surv = 1-exp(-exp(qbar)), |
| 252 | 1x |
se = sqrt(t), |
| 253 | 1x |
lower = 1-exp(-exp(qbar - 1.96*se)), |
| 254 | 1x |
upper = 1-exp(-exp(qbar + 1.96*se)) |
| 255 |
) |> |
|
| 256 | ||
| 257 |
# arrange by strata and descending (!) survival probability |
|
| 258 | 1x |
dplyr::arrange(strata, desc(surv)) |
| 259 | ||
| 260 | ||
| 261 |
# Plot a pooled Kaplan-Meier curve ---------------------------------------- |
|
| 262 | ||
| 263 |
# plot pooled Kaplan-Meier curve based on pooled survival probabilities |
|
| 264 | ||
| 265 | 1x |
km_plot <- km_survival |> |
| 266 | 1x |
ggplot2::ggplot(ggplot2::aes(x = time, y = surv, color = strata, fill = strata)) + |
| 267 | 1x |
ggplot2::geom_line(linewidth = 1.25) + |
| 268 | 1x |
ggplot2::geom_ribbon(ggplot2::aes(ymin = lower, ymax = upper), linetype = 0, alpha = 0.2) + |
| 269 | 1x |
ggplot2::theme_minimal(base_size = 14) + |
| 270 | 1x |
ggplot2::labs( |
| 271 | 1x |
x = "Follow-up time", |
| 272 | 1x |
y = "Pr(Survival) [%]", |
| 273 | 1x |
color = "Exposure", |
| 274 | 1x |
fill = "Exposure" |
| 275 |
) + |
|
| 276 | 1x |
ggplot2::theme(legend.position = "top") |
| 277 | ||
| 278 | ||
| 279 |
# Extract median survival time -------------------------------------------- |
|
| 280 | ||
| 281 |
# This section is taken from Terry M. Therneau's |
|
| 282 |
# print.survfit function in the survival package! |
|
| 283 | ||
| 284 |
# to extract the median survival times, we |
|
| 285 |
# apply the same sub-functions used in |
|
| 286 |
# print.survfit function of the original survival package |
|
| 287 |
# which defines the median survival time as the time |
|
| 288 |
# the survival probability drops below .5 for the first time |
|
| 289 |
# see https://github.com/cran/survival/blob/master/R/print.survfit.R#L109 |
|
| 290 | ||
| 291 |
# Therneau defined some edge cases (x = time, y = surv): |
|
| 292 |
# Nuisance 1: if one of the y's is exactly .5, we want the mean of |
|
| 293 |
# the corresponding x and the first x for which y<.5. We need to |
|
| 294 |
# use the equivalent of all.equal to check for a .5 however: |
|
| 295 |
# survfit(Surv(1:100)~1) gives a value of .5 + 1.1e-16 due to |
|
| 296 |
# roundoff error. |
|
| 297 |
# Nuisance 2: there may be an NA in the y's |
|
| 298 |
# Nuisance 3: if no y's are <=.5, then we should return NA |
|
| 299 |
# Nuisance 4: the obs (or many) after the .5 may be censored, giving |
|
| 300 |
# a stretch of values = .5 +- epsilon |
|
| 301 | 1x |
# Median finder taken from Terry Therneau's print.survfit (survival
# package): returns the first time `x` at which the survival curve `y`
# reaches 0.5, handling the edge cases documented above (a value of
# exactly .5 up to roundoff, NAs in y, and no crossing at all → NA).
minmin <- function(y, x) {
  tol <- .Machine$double.eps^0.5  # same tolerance as used in all.equal()
  below <- !is.na(y) & y < 0.5 + tol
  if (!any(below)) {
    return(NA)
  }
  x_below <- x[below]
  y_below <- y[below]
  # If the first retained y is (numerically) exactly .5 and the curve
  # later drops strictly lower, report the mean of the tied time and
  # the first strictly-lower time; otherwise the first retained time
  # is the median.
  if (abs(y_below[1] - 0.5) < tol && any(y_below < y_below[1])) {
    first_lower <- min(which(y_below < y_below[1]))
    (x_below[1] + x_below[first_lower]) / 2
  } else {
    x_below[1]
  }
}
|
| 313 | ||
| 314 |
# apply this across strata listed in the km_survival table |
|
| 315 | 1x |
# Extract the median survival time (and its 95% CI) for one stratum `i`
# of the pooled `km_survival` table, following print.survfit: the median
# is the first time the survival probability drops below .5, and the CI
# bounds are the times the pointwise CI curves do the same.
# Returns a one-row tibble with columns strata, t_median, t_lower, t_upper.
minmin_strata <- function(i){

  km_survival_stratum <- km_survival |>
    dplyr::filter(strata == i)

  # median survival time
  med <- minmin(y = km_survival_stratum$surv, x = km_survival_stratum$time)

  # 95% CI of the median survival time = times at which the lower and
  # upper pointwise 95% confidence curves of the survival probability
  # drop below .5
  if (!is.null(km_survival_stratum$upper)) {
    upper <- minmin(y = km_survival_stratum$upper, x = km_survival_stratum$time)
    lower <- minmin(y = km_survival_stratum$lower, x = km_survival_stratum$time)
  } else {
    # Therneau assigns 0 if no confidence intervals are available
    upper <- 0
    lower <- 0
  }

  # one-row output table for this stratum (returned directly; the
  # original assigned intermediate results to unused locals)
  tibble::tibble(
    strata = i,       # stratum
    t_median = med,   # median survival time
    t_lower = lower,  # lower 95% CI of median survival time
    t_upper = upper   # upper 95% CI of median survival time
  )
}
|
| 349 | ||
| 350 | 1x |
km_median_survival <- do.call(rbind, lapply(X = unique(km_survival$strata), FUN = minmin_strata)) |
| 351 | ||
| 352 | ||
| 353 |
# return results ---------------------------------------------------------- |
|
| 354 | ||
| 355 | 1x |
return( |
| 356 | 1x |
list( |
| 357 | 1x |
km_median_survival = km_median_survival, |
| 358 | 1x |
km_plot = km_plot, |
| 359 | 1x |
km_survival_table = km_survival |
| 360 |
) |
|
| 361 |
) |
|
| 362 | ||
| 363 | ||
| 364 |
} |
| 1 |
#' Install package on demand |
|
| 2 |
#' |
|
| 3 |
#' @description |
|
| 4 |
#' This function checks if a package is installed. If not, it |
|
| 5 |
#' asks the user if they want to install it. If the user |
|
| 6 |
#' agrees, it installs the package from CRAN. |
|
| 7 |
#' |
|
| 8 |
#' @param pkg a character string with the name of the package |
|
| 9 |
#' @param quiet logical. If TRUE, suppresses messages during installation |
|
| 10 |
#' @param ... additional arguments passed to \code{install.packages}
|
|
| 11 |
#' |
|
| 12 |
install_on_demand <- function(pkg, quiet = FALSE, ...) {
  # Checks whether package `pkg` is available in the library. If not,
  # asks the user for permission to install it from CRAN (interactive
  # sessions only; in non-interactive sessions this is a silent no-op).
  #
  # pkg:   character string, name of the package
  # quiet: logical, passed on to install.packages() to suppress output
  # ...:   additional arguments forwarded to install.packages()
  #        (the original version documented but never forwarded these)
  #
  # Returns NULL invisibly.
  if (requireNamespace(pkg, quietly = TRUE)) {
    return(invisible(NULL))
  }
  if (interactive()) {
    answer <- utils::askYesNo(paste("Package", pkg, "needed. Install from CRAN?"))
    # askYesNo() returns NA when the user cancels; a bare `if (answer)`
    # would then error ("missing value where TRUE/FALSE needed"), so
    # treat cancel the same as "No".
    if (isTRUE(answer)) {
      utils::install.packages(pkg, quiet = quiet, ...)
    }
  }
  invisible(NULL)
}