From 319ec81ed3288e24c0a092cfaf63706369de79ef Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Wed, 27 May 2026 17:01:54 -0600 Subject: [PATCH 1/6] adding tests and data, new vignette, biocparallel --- R/arg_check_ml.R | 5 +- R/core_ml.R | 151 ++++++++++--- R/data.R | 18 ++ R/generate_matrices_ml.R | 13 +- R/ife_ml.R | 18 ++ R/plot_ml.R | 55 ++++- R/prep_ml.R | 45 +++- R/run_Fisher_tests.R | 50 +++++ R/run_ML.R | 62 +++--- R/run_ml_pipeline.R | 9 + data/demo_fit.rda | Bin 0 -> 24156 bytes data/demo_ml_tibble.rda | Bin 0 -> 848 bytes inst/scripts/make_demo_data.R | 54 +++++ man/amRml-package.Rd | 5 + man/applyBenjaminiHochberg.Rd | 7 + man/buildLRModel.Rd | 6 +- man/buildRecipe.Rd | 13 +- man/buildTuningGrid.Rd | 13 +- man/buildWflow.Rd | 11 + man/calculateEvalMets.Rd | 17 ++ man/calculateMinSamples.Rd | 4 + man/computeFeatureFreq.Rd | 13 ++ man/createMLResultDir.Rd | 17 +- man/createMLinputList.Rd | 5 +- man/demo_fit.Rd | 20 ++ man/demo_ml_tibble.Rd | 22 ++ man/dot-getTargetVarName.Rd | 9 + man/encodePhenotype.Rd | 11 + man/extractTopFeats.Rd | 8 +- man/fitBestModel.Rd | 12 ++ man/getConfusionMatrix.Rd | 15 ++ man/getNumFeat.Rd | 9 + man/loadMLInputTibble.Rd | 15 +- man/plotBaselineComparison.Rd | 11 + man/plotDefaultEval.Rd | 18 +- man/plotFishers.Rd | 17 +- man/plotPRC.Rd | 17 ++ man/plotTopFeatsVI.Rd | 6 +- man/predictML.Rd | 6 + man/removeTopFeats.Rd | 11 + man/runFisherTests.Rd | 10 + man/runFishers.Rd | 14 ++ man/runIFE.Rd | 21 +- man/runMDRmodels.Rd | 12 +- man/runMLPipeline.Rd | 42 ++-- man/runMLmodels.Rd | 24 ++- man/runModelingPipeline.Rd | 11 + man/selectBestModel.Rd | 13 +- man/shuffleLabels.Rd | 10 +- man/splitMLInputTibble.Rd | 13 +- man/tuneGrid.Rd | 11 +- vignettes/intro.Rmd | 395 ++++++++++++++-------------------- 52 files changed, 995 insertions(+), 379 deletions(-) create mode 100644 R/data.R create mode 100644 data/demo_fit.rda create mode 100644 data/demo_ml_tibble.rda create mode 100644 inst/scripts/make_demo_data.R create mode 100644 man/demo_fit.Rd create mode 100644 man/demo_ml_tibble.Rd diff --git a/R/arg_check_ml.R b/R/arg_check_ml.R index 41f01c8..f6acf09 100644 --- a/R/arg_check_ml.R +++ b/R/arg_check_ml.R @@ -1,5 +1,6 @@ # This script contains functions to check the arguments of ML functions. +#' @importFrom methods is #' @importFrom tibble is_tibble NULL @@ -254,7 +255,7 @@ NULL #' `buildLRModel()` (random forest and boosted tree support planned) #' .checkArgParsnipMod <- function(parsnip_mod) { - if (class(parsnip_mod)[2] != "model_spec") { + if (!is(parsnip_mod, "model_spec")) { stop("A `parsnip` model was expected but not received.") } } @@ -418,7 +419,7 @@ NULL #' @param tune_res Results of grid tuning, such as the output of `tuneGrid()` #' .checkArgTuneRes <- function(tune_res) { - if (class(tune_res)[1] != "tune_results") { + if (!is(tune_res, "tune_results")) { stop("The `tune_res` argument can only take `tune_results` objects.") } } diff --git a/R/core_ml.R b/R/core_ml.R index 5846b40..37f04cd 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -12,6 +12,7 @@ #' @importFrom dplyr select #' @importFrom dplyr slice #' @importFrom hardhat tune +#' @importFrom methods is #' @importFrom parsnip augment #' @importFrom parsnip boost_tree #' @importFrom parsnip extract_fit_engine @@ -71,34 +72,42 @@ NULL #' be designated as training and validation, respectively. #' @param seed [num] For reproducible analysis #' @return An `rsplit` object +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:20), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 10), +#' feat_a = rep(c(0L, 1L), 10), +#' feat_b = rep(c(1L, 0L), 10) +#' ) +#' splitMLInputTibble(ml, split = c(1, 0), seed = 42) #' @export splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) { .checkArgTibble(ml_input_tibble, ml = TRUE) .checkArgSplit(split) .checkArgSeed(seed) - set.seed(seed) - target_var <- .getTargetVarName(ml_input_tibble) # Split the data, maintaining R/S proportions. - if (split[2] == 0) { - # If in CV mode: - # Still retain a stratified testing holdout purely for final reporting metrics; - # CV is only performed on the training portion. - prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test - data_split <- rsample::initial_split( - ml_input_tibble, - prop = prop_train_for_holdout, - strata = !!target_var - ) - } else { - data_split <- rsample::initial_validation_split( - ml_input_tibble, - prop = split, - strata = !!target_var - ) - } + data_split <- withr::with_seed(seed, { + if (split[2] == 0) { + # If in CV mode: + # Still retain a stratified testing holdout purely for final reporting metrics; + # CV is only performed on the training portion. + prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test + rsample::initial_split( + ml_input_tibble, + prop = prop_train_for_holdout, + strata = !!target_var + ) + } else { + rsample::initial_validation_split( + ml_input_tibble, + prop = split, + strata = !!target_var + ) + } + }) return(data_split) } @@ -114,6 +123,14 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280 #' @param pca_threshold [num] The proportion of total variance for which the #' principle components account #' @return A `recipe` object +#' @examples +#' train <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' feat_a = rep(c(0L, 1L), 5), +#' feat_b = rep(c(1L, 0L), 5) +#' ) +#' buildRecipe(train, use_pca = FALSE) #' @export buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { .checkArgTibble(train_data, ml = TRUE) @@ -157,6 +174,9 @@ buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { #' @param multi_class [bool] Whether to construct a model for multi-class #' classification #' @return A `parsnip` `logistic_reg` object +#' @examples +#' buildLRModel() +#' buildLRModel(multi_class = TRUE) #' @export buildLRModel <- function(multi_class = FALSE) { .checkArgMultiClass(multi_class) @@ -186,6 +206,16 @@ buildLRModel <- function(multi_class = FALSE) { #' `buildLRModel()` (random forest and boosted tree support planned) #' @param recipe A recipe, such as the output of `buildRecipe()` #' @return A `workflow` object +#' @examples +#' train <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' feat_a = rep(c(0L, 1L), 5), +#' feat_b = rep(c(1L, 0L), 5) +#' ) +#' rec <- buildRecipe(train, use_pca = FALSE) +#' lr <- buildLRModel() +#' buildWflow(lr, rec) #' @export buildWflow <- function(parsnip_mod, recipe) { .checkArgParsnipMod(parsnip_mod) @@ -210,6 +240,12 @@ buildWflow <- function(parsnip_mod, recipe) { #' regression. 0 corresponds to L2 regularization; 1 corresponds to L1; #' intermediate values (0, 1) correspond to elastic net. #' @return A logistic regression tuning grid as a tibble +#' @examples +#' buildTuningGrid( +#' model = "LR", +#' penalty_vec = 10^c(-3, -1), +#' mix_vec = c(0, 0.5, 1) +#' ) #' @export buildTuningGrid <- function( model = "LR", @@ -243,6 +279,14 @@ buildTuningGrid <- function( #' `buildTuningGrid()` #' @param n_fold [num] Number of folds of cross-validation #' @return Results of grid tuning +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' wflow <- buildWflow(buildLRModel(), +#' buildRecipe(rsample::training(data_split))) +#' grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +#' set.seed(1) +#' tuneGrid(wflow, data_split, grid, n_fold = 2) #' @export tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), n_fold = 5) { @@ -250,20 +294,18 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), .checkArgWflow(wflow) .checkArgDataSplit(data_split) - split_class <- class(data_split)[1] - # Always do CV on the training portion of the split train_df <- rsample::training(data_split) target_var <- .getTargetVarName(train_df) - if (identical(split_class, "initial_split")) { + if (is(data_split, "initial_split")) { # CV on training portion; final eval will use the held-out test set resamples <- rsample::vfold_cv(train_df, v = n_fold, strata = !!target_var) - } else if (identical(split_class, "initial_validation_split")) { + } else if (is(data_split, "initial_validation_split")) { # Use the validation partition from the original three-way split. resamples <- rsample::validation_set(data_split) } else { - stop("Unsupported rsample split object: ", split_class) + stop("Unsupported rsample split object: ", class(data_split)[1]) } tune_res <- tune::tune_grid( @@ -294,6 +336,16 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), #' @param select_best_metric [chr] Metric to select best model: "f_meas", #' "pr_auc", "mcc", or "bal_accuracy" #' @return Best model workflow +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' wflow <- buildWflow(buildLRModel(), +#' buildRecipe(rsample::training(data_split))) +#' set.seed(1) +#' tune_res <- tuneGrid(wflow, data_split, +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2) +#' selectBestModel(tune_res, wflow, "mcc") #' @export selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { .checkArgTuneRes(tune_res) @@ -315,6 +367,17 @@ selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { #' training. This can be the output of #' `rsample::training(splitMLInputTibble(ml_input_tibble))`. #' @return Best model fit +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' train <- rsample::training(data_split) +#' wflow <- buildWflow(buildLRModel(), buildRecipe(train)) +#' set.seed(1) +#' tune_res <- tuneGrid(wflow, data_split, +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2) +#' best_wflow <- selectBestModel(tune_res, wflow, "mcc") +#' fitBestModel(best_wflow, train) #' @export fitBestModel <- function(final_mod, train_data) { .checkArgWflow(final_mod) @@ -361,6 +424,11 @@ fitBestModel <- function(final_mod, train_data) { #' `rsample::testing(splitMLInputTibble(ml_input_tibble))`. #' @return Test data (tibble) with an added column for predicted phenotype #' labels +#' @examples +#' data(demo_ml_tibble) +#' data(demo_fit) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' predictML(demo_fit, rsample::testing(data_split)) #' @export predictML <- function(fit, test_data) { .checkArgWflow(fit) @@ -379,6 +447,20 @@ predictML <- function(fit, test_data) { #' @param test_data_plus_predictions Test data (tibble) with an added column for #' predicted phenotype labels, such as the output of `predictML()` #' @return Confusion matrix of class `conf_mat` +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:8), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 4), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c("Resistant", "Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible"), +#' levels = c("Resistant", "Susceptible") +#' ) +#' ) +#' getConfusionMatrix(preds) #' @export getConfusionMatrix <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -613,6 +695,22 @@ getConfusionMatrix <- function(test_data_plus_predictions) { #' #' @inheritParams getConfusionMatrix #' @return F1 score, AUPRC, balanced accuracy, nMCC, and log2(AUPRC/prior) +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +#' ) +#' calculateEvalMets(preds) #' @export calculateEvalMets <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -644,6 +742,9 @@ calculateEvalMets <- function(test_data_plus_predictions) { #' @return A tibble with a column for top features (`Variable`), a column for #' `Importance`, and a column for `Sign` (or, for multi-class, a tibble with #' per-class columns of importance scores for each `Variable`) +#' @examples +#' data(demo_fit) +#' extractTopFeats(demo_fit, n_top_feats = 10) #' @export extractTopFeats <- function( fit, prop_vi_top_feats = c(0, 1), @@ -691,7 +792,7 @@ extractTopFeats <- function( # Take a different approach if using multi-class (the previous code would give # a less meaningful result). - if (class(fit$fit$actions$model$spec)[1] == "multinom_reg") { + if (is(fit$fit$actions$model$spec, "multinom_reg")) { warning(paste( "Extracting top features from a multi-class model.", "The `prop_vi_top_feats` and `n_top_feats` arguments do not apply." diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..0f06f74 --- /dev/null +++ b/R/data.R @@ -0,0 +1,18 @@ +#' Demo ML input tibble +#' +#' Stratified subset (30 Resistant + 30 Susceptible) of the AMP-genes-binary +#' matrix from the bundled `Sfl_parquet.duckdb`, restricted to 80 feature +#' columns. +#' +#' @format A tibble with 60 rows and 82 columns: `genome_id`, +#' `genome_drug.resistant_phenotype`, and 80 binary feature columns. +#' @source `inst/scripts/make_demo_data.R`. +"demo_ml_tibble" + +#' Demo LR fit +#' +#' A tuned logistic-regression workflow fitted on [demo_ml_tibble]. +#' +#' @format A fitted `workflow` object (output of [fitBestModel()]). +#' @source `inst/scripts/make_demo_data.R`. +"demo_fit" diff --git a/R/generate_matrices_ml.R b/R/generate_matrices_ml.R index 67ba14c..69a6a87 100644 --- a/R/generate_matrices_ml.R +++ b/R/generate_matrices_ml.R @@ -654,10 +654,10 @@ skipImbalancedMatrix <- function(genome_ids, grouped <- split(group_df, group_df$drug_or_class) - purrr::walk(names(grouped), function(drug_class) { + new_files <- purrr::map(names(grouped), function(drug_class) { group <- grouped[[drug_class]] - purrr::walk(unique(group$stratification), function(leave_one_out) { + purrr::map(unique(group$stratification), function(leave_one_out) { subset <- dplyr::filter(group, stratification != leave_one_out) if (nrow(subset) == 0) { return(NULL) @@ -675,11 +675,12 @@ skipImbalancedMatrix <- function(genome_ids, ) )) arrow::write_parquet(combined, out_file) - created <<- c(created, out_file) - log("debug", paste0("Created LOO file: ", out_file)) - }) - }) + out_file + }) |> purrr::compact() |> unlist() + }) |> unlist() + + created <- c(created, new_files) } log("info", "All LOO matrices generated and saved.") diff --git a/R/ife_ml.R b/R/ife_ml.R index 3d72a6d..cd12268 100644 --- a/R/ife_ml.R +++ b/R/ife_ml.R @@ -17,6 +17,16 @@ NULL #' `runMLPipeline()`, containing all features to be removed from the #' `ml_input_tibble` #' @return An amR pangenome with top features removed +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' feat_b = c(0L, 1L, 0L, 1L, 0L, 1L), +#' feat_c = c(1L, 1L, 0L, 0L, 1L, 0L) +#' ) +#' top <- tibble::tibble(Variable = c("feat_a", "feat_c")) +#' removeTopFeats(ml, top) #' @export removeTopFeats <- function(ml_input_tibble, top_feat_tibble) { .checkArgTibble(ml_input_tibble) @@ -60,6 +70,14 @@ removeTopFeats <- function(ml_input_tibble, top_feat_tibble) { #' @param verbose [bool] The function will stay quiet if set to `FALSE`. #' @return A tibble with IFE performance (note: this will be returned within a #' list along with top features removed per iteration if `return_feats = TRUE`.) +#' @examples +#' data(demo_ml_tibble) +#' set.seed(1) +#' runIFE( +#' demo_ml_tibble, by_num = TRUE, by_vi = FALSE, +#' percent_removal_vec = c(25, 50), mix_vec = 0, +#' return_feats = TRUE, verbose = FALSE +#' ) #' @export runIFE <- function( ml_input_tibble, by_num = TRUE, by_vi = FALSE, diff --git a/R/plot_ml.R b/R/plot_ml.R index 90121f3..84aea3d 100644 --- a/R/plot_ml.R +++ b/R/plot_ml.R @@ -30,6 +30,22 @@ NULL #' @param test_data_plus_predictions Test data (tibble) with an added column for #' predicted phenotype labels, such as the output of `predict()`. #' @return A precision-recall curve as a `ggplot2` object +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +#' ) +#' plotPRC(preds) #' @export plotPRC <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -54,6 +70,9 @@ plotPRC <- function(test_data_plus_predictions) { #' @param fit Best model fit, such as the output of `fitBestModel()` #' @param n_top_feats [num] Number of top features to plot #' @return Variable importance plot (a `ggplot2` object) +#' @examples +#' data(demo_fit) +#' plotTopFeatsVI(demo_fit, n_top_feats = 10) #' @export plotTopFeatsVI <- function(fit, n_top_feats = 10) { .checkArgWflow(fit) @@ -83,6 +102,15 @@ plotTopFeatsVI <- function(fit, n_top_feats = 10) { #' @param ylab [chr] Label for y axis #' @return A `ggplot2` scatterplot (performance metric or runtime vs. #' `train_prop` or `n_fold`), colored by model +#' @examples +#' default_eval <- tibble::tibble( +#' train_prop = c(0.5, 0.6, 0.7, 0.5, 0.6, 0.7), +#' n_fold = rep(5, 6), +#' model = rep(c("LR", "RF"), each = 3), +#' avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) +#' ) +#' plotDefaultEval(default_eval, x_default_eval = "train_prop", +#' y_default_eval = "avg_f1_score") #' @export plotDefaultEval <- function( default_eval_tibble, x_default_eval = "train_prop", @@ -138,6 +166,16 @@ plotDefaultEval <- function( #' @param shuffled_label_results Output of `runMLPipeline()` #' (`shuffle_labels = TRUE`) #' @return A bar plot with balanced accuracy comparisons per antibiotic +#' @examples +#' non_shuffled <- tibble::tibble( +#' antibiotic = c("AMP", "CIP", "CRO"), +#' bal_acc = c(0.88, 0.81, 0.92) +#' ) +#' shuffled <- tibble::tibble( +#' antibiotic = c("AMP", "CIP", "CRO"), +#' bal_acc = c(0.52, 0.49, 0.55) +#' ) +#' plotBaselineComparison(non_shuffled, shuffled) #' @export plotBaselineComparison <- function( non_shuffled_label_results, @@ -200,10 +238,19 @@ plotBaselineComparison <- function( #' Labels are applied only to the top-ranked features to preserve clarity. #' #' @examples -#' \dontrun{ -#' plotFishers(fisher_results) -#' plotFishers(fisher_results, label_top_n = 0) -#' } +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:10), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 10), +#' value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 5), each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' fisher_results <- runFishers(tmp, Q = 0.05) +#' plotFishers(fisher_results, alpha = 0.05, label_top_n = 2) #' #' @import ggplot2 #' @import dplyr diff --git a/R/prep_ml.R b/R/prep_ml.R index 4a5954e..cbacc91 100644 --- a/R/prep_ml.R +++ b/R/prep_ml.R @@ -73,6 +73,18 @@ NULL #' upstream processing. #' @returns A tibble (one row per genome, one column per feature), plus #' "Resistant" or "Susceptible" AMR phenotype labels +#' @examples +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:6), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 6), +#' value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 3), each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' loadMLInputTibble(tmp) #' @export loadMLInputTibble <- function(parquet_path) { .checkArgPath(parquet_path) @@ -135,6 +147,14 @@ loadMLInputTibble <- function(parquet_path) { #' (multi-class classification for determining the drug classes to which each #' genome is resistant), but not both. #' @return Number of features +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' feat_b = c(0L, 1L, 0L, 1L, 0L, 1L) +#' ) +#' getNumFeat(ml) #' @export getNumFeat <- function(ml_input_tibble) { .checkArgTibble(ml_input_tibble, ml = TRUE) @@ -168,19 +188,27 @@ getNumFeat <- function(ml_input_tibble) { #' genome is resistant), but not both. #' @param seed [num] For reproducible analysis #' @return Pangenome with randomly shuffled AMR phenotype labels +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L) +#' ) +#' shuffleLabels(ml, seed = 42) #' @export shuffleLabels <- function(ml_input_tibble, seed = 5280) { .checkArgTibble(ml_input_tibble, ml = TRUE) .checkArgSeed(seed) - set.seed(seed) target_var <- .getTargetVarName(ml_input_tibble) log <- .ml_logger("debug") log("debug", "Shuffling phenotype labels (for baseline models).") - ml_input_tibble |> - dplyr::mutate(!!target_var := sample(!!target_var)) + withr::with_seed(seed, { + ml_input_tibble |> + dplyr::mutate(!!target_var := sample(!!target_var)) + }) } #' calculateMinSamples() @@ -190,6 +218,9 @@ shuffleLabels <- function(ml_input_tibble, seed = 5280) { #' @param smallest_n_obs_rs [numeric] Minimum number of observations of the #' rarer class required per fold/partition. Default is 1. #' @return Minimum total observations required, adjusted by smallest_n_obs_rs +#' @examples +#' calculateMinSamples(n_fold = 5, split = c(1, 0), res_prop = 0.3) +#' calculateMinSamples(n_fold = 5, split = c(0.6, 0.2), res_prop = 0.1) #' @export calculateMinSamples <- function(n_fold, split, res_prop, smallest_n_obs_rs = 1) { base <- .calculateMinSamples(n_fold, split, res_prop) @@ -210,6 +241,14 @@ calculateMinSamples <- function(n_fold, split, res_prop, smallest_n_obs_rs = 1) #' @return The name of the target variable to be used for machine learning: #' either `rlang::sym("genome_drug.resistant_phenotype")` or #' `rlang::sym("resistant_classes")` +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:4), +#' genome_drug.resistant_phenotype = c("Resistant", "Susceptible", +#' "Resistant", "Susceptible"), +#' feat_a = c(1L, 0L, 1L, 0L) +#' ) +#' .getTargetVarName(ml) #' @export .getTargetVarName <- function(ml_input_tibble) { .checkArgTibble(ml_input_tibble, ml = TRUE) diff --git a/R/run_Fisher_tests.R b/R/run_Fisher_tests.R index e0cf7f5..abaf42c 100644 --- a/R/run_Fisher_tests.R +++ b/R/run_Fisher_tests.R @@ -24,6 +24,16 @@ NULL #' @return A list with: #' - `df`: the input data with AMR phenotype now encoded as integer #' - `target`: a binarized vector of the encoded AMR phenotypes +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = c("Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible"), +#' gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) +#' ) +#' encoded <- encodePhenotype(df) +#' encoded$target #' @export encodePhenotype <- function(df, susceptible_label = "Susceptible", resistant_label = "Resistant") { stopifnot("genome_drug.resistant_phenotype" %in% colnames(df)) # Throw a fit if it's the wrong matrix @@ -49,6 +59,15 @@ encodePhenotype <- function(df, susceptible_label = "Susceptible", resistant_lab #' @param alternative Type of Fisher test: "two.sided", "greater", or "less" #' #' @return A tibble with columns: gene, p_value +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), +#' gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +#' ) +#' encoded <- encodePhenotype(df) +#' runFisherTests(encoded$df, encoded$target) #' @export runFisherTests <- function(df, target, alternative = "two.sided") { feature_cols <- df |> @@ -77,6 +96,12 @@ runFisherTests <- function(df, target, alternative = "two.sided") { #' @param Q The FDR threshold (default = 0.05) #' #' @return Returns Fisher results with added cols for adj_p_value, sig_after_bh, Q +#' @examples +#' fisher_results <- tibble::tibble( +#' gene = paste0("gene_", 1:5), +#' p_value = c(0.001, 0.01, 0.04, 0.2, 0.5) +#' ) +#' applyBenjaminiHochberg(fisher_results, Q = 0.05) #' @export applyBenjaminiHochberg <- function(df_fisher, Q = 0.05) { stopifnot("p_value" %in% colnames(df_fisher)) @@ -102,6 +127,18 @@ applyBenjaminiHochberg <- function(df_fisher, Q = 0.05) { #' @param target The binarized AMR phenotype "target" output from before #' #' @return The BH Fisher data table with added feature frequency columns +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), +#' gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +#' ) +#' encoded <- encodePhenotype(df) +#' fisher <- applyBenjaminiHochberg( +#' runFisherTests(encoded$df, encoded$target), Q = 0.05 +#' ) +#' computeFeatureFreq(encoded$df, fisher, encoded$target) #' @export computeFeatureFreq <- function(df, df_fisher, target) { df_features <- df |> @@ -135,6 +172,19 @@ computeFeatureFreq <- function(df, df_fisher, target) { #' @param resistant_label "Resistant" phenotype label in the data #' #' @return The Fisher test results, BH correction, and feature frequencies +#' @examples +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:10), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 10), +#' value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 5), each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' runFishers(tmp, Q = 0.05) #' @export runFishers <- function( matrix_path, diff --git a/R/run_ML.R b/R/run_ML.R index 2ed07e7..06caff2 100644 --- a/R/run_ML.R +++ b/R/run_ML.R @@ -1,7 +1,8 @@ -#' @keywords internal #' Pulls the ML parameters .json and reads what model split parameters are to be #' used. These defaults can be overridden if you so choose, but consider regenerating #' the ML matrices with these new split/CV values instead. +#' +#' @keywords internal #' @noRd .resolveSplitParams <- function(parquet_path, defaults = list( @@ -49,19 +50,11 @@ #' } #' #' @examples -#' \dontrun{ -#' # Basic directory structure -#' paths <- createMLResultDir("/path/to/results") -#' -#' # LOO analysis stratified by year -#' paths_loo <- createMLResultDir("/path/to/results", -#' stratify_by = "year", -#' LOO = TRUE -#' ) -#' -#' # MDR analysis -#' paths_mdr <- createMLResultDir("/path/to/results", MDR = TRUE) -#' } +#' out_dir <- file.path(tempdir(), "amRml_createdir_example") +#' dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +#' createMLResultDir(out_dir) +#' createMLResultDir(out_dir, stratify_by = "year", LOO = TRUE) +#' createMLResultDir(out_dir, MDR = TRUE) #' #' @export createMLResultDir <- function(path, @@ -533,7 +526,7 @@ createMLinputList <- function(path, #' Run MDR (multi-drug resistance) machine learning models #' #' Executes machine learning pipeline for MDR analysis using logistic regression -#' with parallel processing via the future backend. Trains models on all MDR +#' with parallel processing via BiocParallel. Trains models on all MDR #' parquet files and saves results to designated output directories. #' #' @param path Character scalar. Base directory containing MDR matrix files. @@ -600,21 +593,19 @@ runMDRmodels <- function(path, return(invisible(NULL)) } - old_plan <- future::plan() - on.exit(future::plan(old_plan), add = TRUE) - future::plan(future::multisession, workers = threads) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = 42) if (isTRUE(verbose)) { - nw <- tryCatch(future::nbrOfWorkers(), error = function(e) NA_integer_) - message("runMDRmodels(): enabling multisession with workers = ", nw) + message("runMDRmodels(): enabling SnowParam with workers = ", threads) } # Auto tags for shuffled and PCA shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" - results_list <- future.apply::future_lapply( + results_list <- BiocParallel::bplapply( seq_len(nrow(files)), + BPPARAM = param, FUN = function(i) { ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] @@ -697,8 +688,7 @@ runMDRmodels <- function(path, } NULL - }, - future.seed = TRUE + } ) if (verbose) { @@ -790,8 +780,7 @@ runMDRmodels <- function(path, #' @note #' This function requires the following packages: #' \itemize{ -#' \item \pkg{future} - for parallel processing backend -#' \item \pkg{future.apply} - for parallel lapply +#' \item \pkg{BiocParallel} - for parallel processing backend #' \item \pkg{readr} - for reading/writing TSV files #' \item \pkg{dplyr}, \pkg{purrr}, \pkg{stringr}, \pkg{tibble} - for data manipulation #' } @@ -876,13 +865,10 @@ runMLmodels <- function(path, return(invisible(NULL)) } - old_plan <- future::plan() - on.exit(future::plan(old_plan), add = TRUE) - future::plan(future::multisession, workers = threads) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = 42) if (isTRUE(verbose)) { - nw <- tryCatch(future::nbrOfWorkers(), error = function(e) NA_integer_) - message("runMLmodels(): enabling multisession with workers = ", nw) + message("runMLmodels(): enabling SnowParam with workers = ", threads) } # LOO / cross-test configuration prefix @@ -909,8 +895,9 @@ runMLmodels <- function(path, shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" - results_list <- future.apply::future_lapply( + results_list <- BiocParallel::bplapply( seq_len(nrow(files)), + BPPARAM = param, FUN = function(i) { ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] @@ -1004,8 +991,7 @@ runMLmodels <- function(path, } NULL - }, - future.seed = TRUE + } ) if (verbose) { @@ -1040,6 +1026,16 @@ runMLmodels <- function(path, #' #' @return Invisibly returns the output directory used for ML results. #' +#' @examples +#' \dontrun{ +#' runModelingPipeline( +#' parquet_duckdb_path = "path/to/Bug_parquet.duckdb", +#' threads = 16, n_fold = 5, split = c(1, 0), min_n = 25, +#' prop_vi_top_feats = c(0, 1), pca_threshold = 0.99, +#' verbose = TRUE, use_saved_split = TRUE +#' ) +#' } +#' #' @export runModelingPipeline <- function(parquet_duckdb_path, threads = 16, diff --git a/R/run_ml_pipeline.R b/R/run_ml_pipeline.R index ad9f691..797cab0 100644 --- a/R/run_ml_pipeline.R +++ b/R/run_ml_pipeline.R @@ -74,6 +74,15 @@ NULL #' `top_feat_tibble`. Tuning results, the fit object, and model predictions may #' also be returned if `return_tune_res`, `return_fit`, and/or `return_pred`, #' respectively, are set to `TRUE`. +#' @examples +#' data(demo_ml_tibble) +#' set.seed(1) +#' runMLPipeline( +#' ml_input_tibble = demo_ml_tibble, model = "LR", +#' split = c(1, 0), n_fold = 2, +#' penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1), +#' n_top_feats = 10, verbose = FALSE +#' ) #' @export runMLPipeline <- function( ml_input_tibble, model = "LR", split = c(0.6, 0.2), diff --git a/data/demo_fit.rda b/data/demo_fit.rda new file mode 100644 index 0000000000000000000000000000000000000000..b3fa4ea4ce384612b9300f98e5a8582fb52b8e9b GIT binary patch literal 24156 zcmV(xKvQ&2UKVgRpfklK zA+`t`^LEM5g0JmSTuuD9u`sR7lvVPfxsw3wsi7u2d)*JnB3liS_#LP6kc@u0Y6xMz zdgH5iPS4c|CUbHDvHW}$2gZZdYB}udY#u=jPz}8ERzj*BnMU!+F$R?OF5*~M`g2uV zRk{&#(lotowWxV(oqm1fpPtEX<-Y#~)f1DC_2yY^d#Q8{L3+4Rlnd-@c$S77_Dyvo zV&XLlteeZH=>dD8>&bEO9atcffcwOQ8vt7c1emrRt8jPx)M1P#uE_M=jx0KG%*8<8 z{vT7f-DH%u#p8jC6*rvc)5fP^BNNGFT=0ZWtQ3i?3Nl&F=~V!d^6S&86Oy-_whDCu z&*Ty0jCb}xgs-{E6woyK01g_Cg@Z3}_z;4VT{zvkV=U<10NZ%_ZRQ?ue6rU5z2wzI zB*OEy{a?ib%7`Y0!n>IpYdAy{0%E$^+a8hX&G@yn*2aYyZ;sIrz7;450<&0M<8&RAx>dt z#myLp=nX*O=*dkFI`ezmtC9#ZMys_=e2^8DH)n?E4zR`ogi%;Mrsh?$^-)|hqF0*K z5x-*6GNOTpd_qJRcnW*yQRVmr#kGs^kd;nHSr*%$zp^d1@dV{=6oa)ukgV3u zX!C8#&o^0E16?n6S(O<@CNpNqU_~J+amhMuKI1ke+;1VD6pQtq)a;~=n{huQpS=FFQA zm1tyLYvy8My5<5<2=T=j zEANw#zN8?k_5cDNxJb!erLG5b%-Bjm-Z~)QaWmu0l{9AG+{AQ{WASN0L)*-|I9RE1 zlepc9wM7iy5P;|<72x;n3(` zGyiFTJ{Y~0iL|GP+pWqXG!+;UUau!o!fz)@I0yHRzG?mHMxhHzP+cquDZL>$oMPjl z84;Y#7Z%)h5vYGUk57YZ zq5Q8!P^IrwAetfk;NlfKA)_zB1%K8I0o9Uey-3CGropP*wz!HEc55Bv>@V^1Qb0cE zh2>gI8qP#Lry_Nwrx0m`ro$X$M>G@%3D{<5C=$dtETO?`WIp;8je(4<415pCnK|J(3mpi7BP z>YaFbKMu_miw%*;gD*;HwVwmEzIXJ}bn+NF+88Wk%@21bcB6LZ1j`gFRm;60&%qEt z)#f$YImf9uVqtlGz(d$tomOVyyM;q~}TPlF1^_q{Dux0EkXo=tN?17u8Fh4S))b|sFwnYklfEyWmTWZ z_7Igz5=hZKtiUfa{{H-aQ&w;!N7S90dqIYf#qWGRNCH&Vryo>stf{el!&Og5eT}22 zXG>jzA&Jw9;nJ#jke;hqt2ojLYidnccn?OCz~=(h2Zv%OC+BFHgOszD;_T?(k%)7U zh%Dh>a!8s0>-Q%WV{hb2QpT)I7Ea#jL5JaGyvFi85Qg{AZz^ZAmu6n%C#=WZdMf^> zx{jL}!uNzz-x}>4dqB5s>>8aUcS50bum+Mc!wBlbQ(zckZ$l2vQbxLxI%$|bHXp6i z+dp(Kg`2YjF#mQ{TPaL=jU4KgM&ImCF>wMRYkb`ej@^T67_(qaSu~R>@hoKLk*pH~ zrz&OnjWvCJ>j9~7LGr)hzyQru%%h;sU}FT`Lp`3;)PVn_#?$jgFXD){>Xb$8mlB=t z37~;z5-70CX}|B}oh})4V(p39bm&NAY~K5!fMYcaN^hDuDo~@)$4tMrQ2VLgVEBU@ zl>Ag;e#h2E3sUr4(X6XrtH5rm>v+s>-q`pGBQEBDe+NKj#hVIw#-fd%suTFcn%<+R zi4LrCQ~jdNnlKKT9@eL0EQY0K#sPQq{GFe#7(=@E>BXcDNZ;7s&zy+`Vcfoy2&KAt2RG^6_wVIz@r ztRi!t|G697cFnDH#vZC=lSRjFNiCzJN zJ4gEyqDAoGm;_`}g3#;?WArzwWQeux9UCoiuEXgRb6JsppxKZJ-lb#%T%h^!Px{aC z5*9WskV&rP>StUQAhVt#7A)M?eZ(3AXv2`mZCjT(y%b0YwbjZFSAKM%s;Ed}9h>uM zcbb9y2nt6UODQg?3~E47soTe8-wy=~YnLRQ-AF77!;Vb)#?$@lv;dBCjv<>*=rS-B_l!8YpJC+*u=JB17B467_+=- z{g4bpv5si=>uboCnHMmB^S-rD8b9CFJpekvgJVM88z%dYsrA8wQ`Ytz@$*~W%oOaz zVxrDX^%}k8b@17}z-~zmXhtK?JIfKEEHQ?%H0#nuF(NU?FvMX_LKPZzTQn-`Wt@Ma zbtJA}h}#JO0_#5NGoDcW`{3VSdPklbTXk$ddcL@u7EG%KP@|6A5VpqkM=~C{yXc*b zxJZs&H7LI|77yWb($*4x`vtlF0)+ZWC{Vl}LHjIILjbQylP_4{MvL~cUEiPyO2D1d z#b^Af@8S+aP}E&`7y@_~iE!#4EnXk23a7KSnQkVBaW zX~VbtiObU0;AnQr2wF3>kdUIGCZ-C$j;-Sv(onqaf%ZwVv%fjOVv4$Q9*uorGqWKmB4am{^}&SE3^d2Q9P zccvo=RKyh>uK6If|FT^YRR%5WlmC}LSwr$1b4ead9~bYC~zuZVA_Wl zXI(#Ou}Y}u!+(W{Ft+Hk*V%qIt_{Ga$2ScwmC+oE7hXk`$| zle_3bSf*RY20S*(sE3qbiUY?A`4bM-24H2hI;bqsY#Fe33OJHLLYpD_zf)*tGFFCT z8#i2-BGbNaQ%6{s1z9F)3CffFwr)gIkvn?k*;mB}mBKYfeu{VkP0)!bVp&;6R5UUV z9=q~lfBfOk+i3xFjX3uonmAC807^&-*aMBM^7G{%tel%6bcUvqpDVSY2}C<*w{4v^ zp#QUedMdFyiTu~q%~@ZLaCxS;{r$(sr8(^g>h|QwPm77VWA6UDg;5+rme|8Jfp&xv zLAO;w!+_yGlUyLgmc37BWR}#ss_)8;mf+S-X)Lp!#&n-tA-_I#y=OtotvuI)>m(As zTBpER5;)HH_g7ld^c>7vaf#p7*rMYYa$=#vbgTCYG9tvtT7s^)^C-2;GNT*o%*#^daRi-ZrU9cQp&9ApLXlYWj zFB#k=Sg49p%W2Yt6Hwz6&BceW-A&aS&_neY$4r-HD2Qf*?cG93(~xqU3e;CB)aoQW zgJ?~p3nHQGl(|aKw#iAwXmb?3Tds_hChLlG*v>p~!&6i9PJDHW4F8l%=hwcp-{>R$ zdC&EQKd8S@ZAp&-OH#Hh;DKwKR!XWKE`_M_4*9IL6nBPJ$~5ZBfn0nD^NWFmA|b*k zv0hNwhMpvs$y??WA-;%NdrTse6WsIbyYSsQBv05%i&c+0YOtAz2F$t^)U*wyJP!<$ zUNr6S!|LCH?*|~OwX-KCz0o*xb}v~lMVrD4;4VoYOuBZhrhosEGJedjES<9;H@SJw z`L+Um{u+jOXTQrj8FsiU@zQ8rSt#kl{vVRtBLPXvs){Xq>ImQ`PSR2b5ghIl3EHR= z=*RB&Zbq{Cwf~wR@*w({L%=6}X>B87M(BmBBDEejRI9230K(vcg}u-C{va5OySRy? z=0}#IR81i~6&ny#G*dZqvg&80IcoR%R0F`p#eTsKI$yLez`9L?xP`6f7r_=*z6QoI z)(e~q^JfYXLlcMk+BLY~<0KXM1|9Cg_HQYN0B8`rftfllb01_1-qazMfadJ$1r+)1 z03f=+gJeRPi_g>C=o+mmyfqv@Zhv34a-`W?OBk#gUUz*q z3DqV^57nfNEjB8X9DYozypj8o%bbOVZE0vwhdFxrp-9Tbd`gUi>#;`mDpWk)2>SPb zZIv+ZMMz21e^t|gwUq?GgqD?;&GEiI+bJ!YA*w8y>a`d(Xluf zstm6S8QIz=ZA4FOO>5ZR9HB3pqs|Mv3{nA**9reCd*;Cl2iC#@6o5ux3`U1VxZFtM z`xeFb)fX95dokNGuAWf$6KVUuG_W4KOqTsQ|C00R&W$9REgotd^Amh5X_kf`(GLP9 zRvZpMn3#!lb*!E)6i1+YgNaK6j@ix@Uruk^^Emr)Ii6&x6R_O6nWyn7SMUf(@SRE< zxsJb|ifymX(W<&m`3kF8`U3E%=}K1KfiBaBD=OeHePJqULqX5nJInyvSTN(N5>&D{ zOBuhnuxZp47rmQ4kl@hU<&;$*ufD!F1bYNomzS`Y{5vfM?I5zVcVU1Dj=M)_D;3lf zcL%QMRdj2wj0r~1hBR3LD^ap#jG5++pKjFz*z z)GuMOm|7K3o`#Z``a9&0NO~rR3^#iyD9G^I%6>0ZDZ*r0W5BsA9fnX7<-?+o6Ye3( zd-_MTb*wCBbH{I7E(mqn;+=NFLq{U+pC41q;_uo?TVSe}a+7sow;pP}ip1x?%!}?; z*o`r=0Q5lC7`O3_tib#5B1;jeatuzH?y_RLj=I`PC1E(b3xb?U1Zdy_$u8mIMp--rM$^nnLqd}<&$oPZn|$VWg8zA(m`5f_tpOD!_e2*WQ8h(!TIK8h-<@#ic%PFK ziNM9YFa6WMqS8KnT~bBo^Urc2!wQd1o)<)sMO^I5(PN8E#HLZ9PjC9v?vj^RKO*R* zev1?#)%gsA(0zME%_~Bvbe8A*ae0Ea>_6JkE6%ZHrN0cHIkZy30=ZUr2e-`nbZP$sE#7>fq#Vq3g5Z6n_G(D<$cH*ybyIJTw3 zUp@wePoU;4pwAHPYdWVmE6mloiq0?2o&Kpfw4>V8Q0739dq;ib@?O_=hQ_YflLI+e z03aVx&n}ujC}aI}dlGH3_dn5T)n7VJa01tyb%fEWTQqrU3Q?E797MZ!79p`dry7UqW!1KS&pe_x+y2dx zT0VQ?W_CZ3c1wsx3({%UpA6>lQyYC=U;YnT&p=cXk9wgyY z@;dDkB8@JJ$uqtT`?^h7QJNc*Ur8;s=$N*~H*5wx`m79(Ak$leYE`a(O&v1bD(pt& zYh4YIu=dPnB*DwqQv zC(IK%^maT$>oC`~jLf&{}3W7=Iy>ID3 zneGfjMLU)~u2(%9rqEjrmmnTiy4~A2q(-Y@b_9jP?hL$jW9-jd)oul zvHhlK1RPtcMnxc;w?KG{?%%VLfWYh=`3o2Feq{-@_N=#rsyL2AD1?YTNJ!Kg;3HmI zX|eewe_NsDFwG&-Zq_AW>KIu%%QrAqfF8V=KdzCE!}z}!nyh8mvkI}PUk6z~%b1=_ zX#*kBWBQKkp6$Hwg$lQ|wKS9$0u+dxC7Q|`0-u~4NNoy3n}R(fj{&}VR|9@4<(-T< zvs9Px8^1syRP{;+$K4hGbKP?sqb}2z@)j6Hy8_!+O}uqNw(vRnyzc0 z&6VqNlph1itKxxU`KpVGSj1(lJG~|M-IP@Ye$_noESK5zkXuck_Ww0UB3|520_iF* zrOX=U{df&JA)>EHba9pxZ^oJ_+E+K+| zwDoZk)-NKw3o+rOgYp?9!*LtN{>C{@kIK;)Q!?D

n<3)1wrVA#Z-eqO`3qP~T`V zGhmkW!OVRrW0ee^N8{(kX#$0~_HkUh{H&A0nVxpFWSIW-HP1=C+w?WJ3;;R+(g3;{ zkZ)nOpK0a?m7CuMgVZ^MVRFB@^3pY1IzeXS6Mx94z0kyX84tq|qsI$D_q1hUFmogW zme*JjZqEEUcj}eDEDY7orr?d)%T9SV7At_juy~`)aLlTiR&$9$s!S)O-HK>Tq|^tMMWDp4 z2IQqozFVqP#L7_Uh-9d)fFszGQF#L}F!ryDFz^m5G5h^6-Eo?zc}ZcwUqq1O>91Qa zIQb9tYI)xCFvjJ3t3lV6B66Yw@v2|ZLFhoMy<<%>X^i&n_BwHSi-74ON!pDRiehGk zVDwG>#<;p@kK;}%-ni`(I}r!-*z1dnp4wsY+Ll`Wi9R>vk+r5>0Q+_4u|>%83V0xv zm+t*#U%_V*s3+o00GOn!v61kwXSuvXyq@6lfMj<|*T|>o1nqJ!pNNfE=RBLeeSw8* zd&2yY(Ag-NT19H>w)7$#(RkYrxdEVFF_FDgjK&4x$`8J)5=Sdf+}M(J7Kj<}tEk=x zLnu(3Zdv;!YBXvksY<-ZR^e!X9wI~Nmw}f8-Gh! zRWT2#?0JSM_BaJ-N_wX%9bTIDL9_;*_Gv-;RskWB=x3~40wb*Abr>d35x)cz7wLAcf?gdrK|vklSRMs2XzA{i0a3`8nP&#^?sV$ zfAkNT__Nn#*6j#YH<5E4X2!A zkmgl3A-SuN8jfdfif>Z9xSc6vVDcNQlJ_jOw{B}o;=~y!r;5YGM7`sNI?Lxyks!)C zBA~y`q>*1f_Vi3vl!zR$IBK-WdP~c_bZ9UzQvX)^sZeO!aD<_XRKEl0 zn_?%U?9vUKwF@W52Tg?c&6I-SvonWmPdg@FW79zH?!<~Fy8f?tpFBZ-O`HwAD6b2} zWwWbPdRfz0as~OWhJ5>KtiMH!ik{mdy_JOkvcggsgQtKl_5c<9sMv@1EfrU1MidVBpQn;>nIeAWtv#Ix76`SRsT|!;d5~F{cMRw)2eH_VSh{H=l{dJ63yCHLDua#D#87$#S6A@ z<(~+@N!=A0sj_{o*WR3>2cYF$+W{MT@*9it!^ZIGZ42zU+ya z?_KmM8qd>h{?hg&N|rlH9t)w5&Lw9_T*Du@aLfSjvr88#KW&nESQ*t%OibppFU9}l z*NlZe>ex2`9AWjTn`roL;-x@)4<-2P7pjG0VvN=5)m~aK&6i-GiRqPSv+`^H#r9RF zB4`_G0*ow*^(q}6#}kC4crM-ntF;oi*gPx#6ovP_nm6%rOLL^?M0lP$AH;}YEi^st zoh@m%1H3yfu!()pC-h0B_Og{KXvcS94p2lj(+5&q^D_tD@C9T%|Fu@6m|-N_6kI7U2N zlyt(Zkxnd%loZr4y#s{AX}OeId_+R~)|-GOAgL}2dUT}}OhuPi(LcFsTQGFA)@B!6!La`bkR;ZbmRx6h$5TmWhR9rRw0f@S*FT-2 zk9>_$!?;x>%EiQI^i*{EA6r9buIsMU+R#3zxJYY-LC6NlWRSeo3199g>!cl_ghuf;^0h-^N<@gIDw-B9mt zt`=PMpDIGb)+#PU`KT-7JG}tg;3)2}t=B8CDr3#A zBmb3s`-K6p4P>%Y?8UYa0O@!3^oqH&w=no2y4gj}B~hgt|26%-?W4M|}244~|Cx*!tzncrsL zDnReuFG4{?Mq1l^!Alf{hk5I>SmlWSj|Ohrox*k&wWz3ak>tHvp&q@WTu+M7eCG48 zK&d2nT@gPgl$q}NOvn1CTHG>FNABTX1ADG=WpZqa?h`0AZN5n6a+Uqe<)`YU5clB5 z-V(GvjDAOX*uwWIbcG{3)-q6k%+V*>5>VZ>fQ2_tEP=WK-9sh)iGye8+RXXD38`FM zH%<(aK#3^B;p<~B`KapeH#u7>#9ZCgGXHsgnlYbnX}7cZ2iKb>zjEO-(^;hmpltZ9 zOsLdf(ENxl;4@Ik{3(K2wS-obRAG+}8cdBHxj;gBG6I3m!s3H+l+Ys&XK%9lFdx(T z>0kzM0|Iae9I4W0&fVR>zHKBi6M#Zg2p?}2BZ8;UP{&T~T!We{0!)s6qcC@FScWx@yPVzO-i&dF zFqn(n^ zBd0X~Is})rkphzH?t{seL=e@sdXDRGt!8N5F|bGOXu#+xT}^8~wZmHQ?fsD9h^ft? z3yrQ&_90r9YAgcbfv0(^SMcTV?3nc-owtf}l{Q^%^v!0i>``bbZzGzBhF6EN!w@7C`4}I6d*v|wkGoVvFB6556Gh+uW>SgU z$HSd)|Es1;5Ot)~dx+rTcXEX{WU;$1C9c8XE1ld=9xoUaDKyFdAA(tI6%Cwf)@)3& zkI=x%`&z{DAAf4!jZkH>g?)z!lK>^5e;}}2PZ9?w1xhM}rAj=(muE{d?z?y+ylaSI zZ{k;uBM*AHk^h(1Dq&rz;k3?+^X8YYDD;f*@rJumXq&QF*4|e{>)l8HKl?nFp_d%p zi^W#h3^2xI7DncBxnjKvb?4Yu2;ZT$Ypg^@dXWKZwqa6PCu~rX4r&{e@~8P(C@~{iWkOL+9ET zaj|hy=yuRF38d0JJrab)ViJSVq<#q0R_2iGwl)o8N*8DoZaNR~cn`DJ$=Nly9`^Fp z;`N+WeOv4&%UtBM(nEi1XjQa#0GL+#X!hXh{|U-PJ3AhN1IzssC&}kkZx62|(1jbH zw?#98?+3fOvv-_z1p;hs0gW0(l<5t;31mDMp=es+gI97-kK6uVRF%->>Ns+QxS4zRM zMCPWBkt-{iFa{st6RF<9p%l{Kz)cARqOiA8JwHS6V*Li54>J1KH0aZMy;YQA>G=E6 zjYUITwLv|EWq%4uM}`c!3~Ga;4<26y5;02gojH22D|ch=X(V*O*!F2lGT?1n1&aLY zK4~U&+LacNJC)2it~de@m0b^x zAT$n#|5Spe-Pt9Tw=Or4VGIZCSHt`1j@|sWJC5ydrBL9%dB>)wmgazr6vAFYSlJTX9&H=<5o?XaK*s#TAgW zjYH>R@{V_&K~Kqrph+GkCofTj@4p-kJv#@Gi{z%eG_9PmCPhfJZ}9ax3ABys)UW1o zWmZLaG_~1fC!8Xk+^F}-j$dfc7^7EUC+|%(`RB3(g-od5+zAHe9L=ZVCKzb0lnGeV z-qUktA}5+dDx!=lE^V(k}fn&B0j0)ni%A>WUwouG18c{CP`p*nKecP z)_naToPamn-Bc}FtJz6@@AzBNE?P+TCq>3NrZt`B-#@I<3}N$CG!pehGz58>ixil$ zOjm+I?fnx8o!q=PqrIUfeP$i_fp8LRi$2Hv>A+?XOx;f{k9~px#O_aGiaOdcr;BK+ z+5`-0Q!-={J@L*!C>n|e%vUdWZmUWxkN21|6(9xpLSajqKoeFKJaTm6<%&8PoJONT zjg_tL8|)+i(oj}Bd0CCpwgk)MWi({7Rl^ES;=*}px%`wblv!i zl-)8&NT_N)_hS4<%RTSRO*^ecz4FK{q#ej&$I!ly3^_N)tEb*Oy!e_NHs^~{|Ff}q z-u6pjHW&oJj*>B=Fzx-<4ud8DR5u=~1a zg9K`mWt>}vklH7?ABue6ByX~NuIRVOJ@L9j6&(p4$s4|DXUPP&Cz8MOsdMj2=K15d zKb(u|(L>r?YZAPU^|4;GX?bR+qURo-LS~~AYAdD>ji6)$lUA%1u9mW#Zd+lAhjo@` zV+K4?@NW^%3<#0dEq1%($->CrF^3&?s3I@ox=qRKb=!o=Yjzwp+M?VWgGX(+zxHh> zhCBGvM|-GF-<`W1%kM7`VF+S>)`eTXx2e+2_4}d?^GnXywQ>5A8r@21N7r~e>7R`R zYh^w$bH{g}?>fkU#*7;{`Uwmx&v=g1vYFr=GTxnUT+O54K99Sxb_a_BRfs0e1rcef zRnVb#OZmawnOCoJDBWTn4DtO>lMte(*HXHA+E!k^R^eI)U)C%lW<*d=X8eBK-mbP7 z0yn&@w=A_tPKo_qi!CCC$LQr@J>opF@|FB&v=M zTRvO-yRXHb%4y&@XAPSGsD&tXP)Rxoe}!EG?G$^;`~__r`Zv_Oou5J&m;oH?iTVRabF+j3!WpTBH^9L%u*xXvVDEt)ct{f*xg|7mDeQ<)D1(2>}xIa$Bnsh zi=rv5+;z$5u-l5m2?I#;uJ&Sm1mfT+V;ZebkSYev7dpM31jZ{=>F2PEujCR=%mv%%aKEi+ROY-Rt)wKjA6M?yUcPg|J*Q)J|bAR zSnlfak*`xB&7{DGIzHV+Hpb}j#Y5hI>f`NvU;7&S@XMIdeXh5GzM1-~T>$gTJ8zeto~ zazY=G0bSYC{q?>lH9>!>AG(o(M0yRkv7}@bZ3Ip(ng{COl(2|hvR%C;uODAPmMVkC z38oEE?G#-@$_oTBlDDgav;y)zxRPSLL8_Qf-%wa-?ljJ7;UGrlJWRpM(NR?CzJE53 zSV~_0UZ%Ajc98n~t7qAL=OxG1du13DEZdGIojmZ{aq_IlJl3AG9RTuym-eyNfOc$o z*3R@wIOw_To-IrxTbh{-HD!lUL{>vVhgC>zUV#2nvvJ=PiXVs&uG$sr6dIK|5lg^)aK*wD@c)F48ykTEb!47Vr)0!w&={#!FeWw_=_(_gk!CID+N0i zA8LB11hDNc{|(R~Shwi0Wb1jw=biVYS#GB$Mi zXPGH%42toiSQLe>P|%%5bOC=$?gH(2NS*3`#7zb%Q012(@mI(k;V*#eJ&QBCX2PY} z!&Q^q25Z+L0YqKMVlSUZLT+9wetu-q?1S2OosD1s(N~jL5q#}#*X>s8lO3SC6WG(? zj3D-4OVuEXPNfD#2!K0pT%iR$S+KYZomB<^Ik3D9@3OPGf7ICp-&8OLz6T~+76@>U zHdCBhk2uE~taUFbuyg=ewybvH%=Du`MurqZ$^Dk=4{~Mp?p+}yl z=eUI6Q#!N9vC6e(RVj*d%pGlQ%?(Ssc#&&>Uc)pC;|yri-bZp>a<;(!YTuqNnmUJF z=wzy)8JPaC&&k&-O~vQ7wiGdnI;Kl~>bC?~FMqjXn~d&%U$R2B*$JyCtO6SZi~-Uf zJ0?Ad;vufVb3e#h5jfXc?D0AGlGOeyvTjMUxC02zGOrThz0{+S#!qF6PKCvG)7C}4 z+GpM@%l3!F@?#jFvjV0wo37ThC!mbTQU{}c_kX@|$q`jmM|$R0r8ti#l#QC(RsYeR z>SoY(E_#bh(qpV@1@CFN%9Yxtt1P=8@iuu}u`>iE2rt6IW=F8P^zCb!lFve&s>WmV zr z0l@r;>$H*w02XJFebQ?oxCe9Rd%$iQTb@34>xf!5%69~SV4(p}vuD2(CJCDC}>kMQhn5|j~FV_U3^NC2l3_O0yVTH>{2U_w!U!QT!AfW#G}O$Zq)#Rf4fg-dw!}M@wND55 z%#X8pqQOxBb87!0Vh_?H?cFO69NA1KNnvt5G8F9k>1QUGL>#cf6UE54V~O;60xrWE z7de*~_i_kRV3k!Smtk_Pg*$iM9%bby;rOP&-U@(&cj-%-M0>%Xgx(=C+bL~|??kOU z+a)l%4MGehl1pxvr_SG);mgy;hqLKWzQ4zV57y85vT%*}wN&(l5t%BMvIxUNQt^FQj*VfLSUjT`;82U`3}$&VKaUy2 zag&}xXE#zl!(Ak?%VXjH_dkBvnp3$uzzG*`$6@k z!)tjyW5SXOmlm6AviTvUhim5Hs)B)xt)1%B4FfS2oBV4E0Tm_)O&0GiDeVJ z$iV3T#%5b%fMi6lq+SzjrqF_3>VWcJvL zd)2gn-5|Di7Qunxg20{^;SN! zaIl~LVW|RXUiZ(kHf5%pMUNO?h1}?9mX@s{3nxW2v=;;q&N-3}Cpt}8G-Y1x)ut8N@^O!*Eh;NVX-@LemRE?{{90bi9SOqK{&l-1E2i(Y)r04F?l8|#q z(V@73f8=oT)%AcG5*JV%?ni;*pU<|p1N>otb7-LsjqzFFJW+XI-9Kgojj6R46 z_ap5eQLZ?-7|SN#)cpRJ9jO;5t5I|8X zr~ew_8S5KXUHcC#DB3r7lyRj*r_jzJr(}8c3So`eZZa_eA^CO&gvd@DpxSZX@tjXf z)mGuN^z!w2>3@x1b-V_iw;B{z--o9^kOU zD2U>DBl4eldXwF!!0#8fTfg=IMI-W~CqDD{$+TupX?jeUxX6{;k~{f$up;eL z|KijlP8k|4v*smU)-z@Amc0|1EO7WAh5yZ?cIW&KmZ9eK&#uh0@F$ zLDYHN=S=h!sMw||F5T+nk>rv&4(Y^~!e60-W2pm$Fr~VjW3BXF(yMBhK0g7~g(cUR z5tPU%c%;_^{pgD?eW|LqA%W4&PYSi$ddC6w(o}g{>}zG@Siko*vU;jwN`h>3H89pf zVhSN&=_W%|w=lZKcYLJnMAh2b$thfq%W2U?=8c7ZO978m+L!5LUMD;B(}GPDsB*^> zzir!7Cx>2<8Jz;ewbB8Pm7>rPoo#J9cyojBj25&@#`618oI$jP%SOUV?gd}ZE@-vI9 zkci&wFitJe7PUGLMabTpMny5B>e9P_{eusr-QP@23TTsA-7oYQfFiEeR|=0vCIQ*m zes;UgS_g4-D$QWfyyFQn;egpO z4`@MW9|cj4_W(fXu@H5gUfA5_*J`2nafHkQL&S+Mju&V)CZ1`=dQ!qGn*7A6Nrp+0 zz%Br*XN!ur)3;7$y zhem~wdE~G1djy&sda#yVWldC``oP;;cM44_2V1gXO$!O+K{}gvgzXxTPhVSaoxJ*K z_nGT8K><}O@XUJ-Au55)9fOr3-vx3(8&Rik-BXgbrCQ59cr(P z;Wh`w4?*SQ{;sOX5MZgkmN15L#Ks) z3nS-C_o2nQfXoSou;PgAg^yEJwhHb~lji3iK4Xu;7QVJ%8+AUpJFGRFGQFmH1U^xj zSw0Ev2oaZnQJP2E4;=LW__2(vK4RWAUHn1$_82^Fyw5UoDd;|Jm%({Ik{>~p&6VADs3N8rORJ6 zh2u>A-=W>GgthBs%D?R;zhO$^s(YTRSQJaccl)57+plrewfiw!|EFE;qU!le|Av(< zn)B=?&UMx}69Vr~W$okAAcd^b{?6B;_s+)S zmD=5fZIFyr_=wy|{`OVX@jLfOr~%DN5pPV!(Ub&ga;N}isUSohF>{|ilIkjf)Rom$ z`;r7=drFE7ZF!r(`K40WZpMs$#{O5b9yIhh%mPG<_`(EwvwMv3_p))SC_D-H(O#TA zETAx1Afov8gXsb9xsQ&|MvSZ|3K;l}u`wo=kTMJ7Av=AxcHk3vC;Ao1w7X*F77B4_ zb^{#>uY1zZ0ncJ)TXc+#gWV%i$f8T*VfOJ25U@i<>$Pu;OvBb1ZJH<7JqEV;GLoq^ zZQID2P0>W}6#{(>+0deXz)KZ^y`(f`wA9i+fEg1@Vl`X&b-oviqnF1MDC^lnGr%7s ze(jGFmBvcZ%1LgfdfQtux_aP=xtUh0RzYQ+zXN1oRU%G2M?FOZ0&!IoEmR z0_SnOu#jZ}w6Fh561Y0}5ou-0Ns0r1Avprsd}qIkdTBgiNs}ha>671FTu%*qczl#N z+Ja9KvD9TV+o&&mDG>ZN-meuPcc^tf z9nq*2@!9bBh`x_9H%hQI8myeUnd>xnuJIuPqq!oTXsjM0-Wm7`qXcGYngP33hF9Z> z6=y+AI%4Yb4J9p5#LGhmzzIp+z=1dirgw8Fa$8z;x>=ehe{#8(P3j4uN_e*xhoQ>$ ztBCx7V(Bp9kUvyZ9tS!NZ zXXTh{MrT`^2PmBi{ap?J9^f4;xb4!2h`V~9^l=z5)u2-u*-9==birFP=wC3@sF~?` zfCy%yS7rJ09y(N_F*LuZ(u6YV@{OtsZA3Hfs)0kqzT97JNYxDRgLbiORbZ$ewC6z# z#)@rVDcts({3z1OUG`0NV}fm1Rj_F2eP|l=b{3b6aV{f{mtXY(*-YDOZ!^`d+iq`x zy$zllTC?hX!SB^gN6D01^dP?LE9P5BH3c?j;ird?~pB+R^Zpg`!6qjq2nbZaA{Wbcu+=sIz&5<4HF(wCQ{Ci)0^U;rWPUpnk zH>cQ|BaOG=-ziL7xm+IM)X*Nl;vi2Ix^m?xcv8sMK2$-YvHUJIhJUZL|aGUMS$K$E+@!N~6}xzK4M< zEC(6~YeLQ2%_Kr*K`$cS1nNVm6!f@oxEnmTV)L9`hV_D{_Ws~c^jLIIvSR$^3JS<+ zU@zh`2}AK^xb3mSf=1yVrk_Mnowg`^ojVo~lt$G-AVG6IbGEp26XD0QZi`?cf>pL} z-4Z^Iq7Lycyxi*wGsszdJ4`6LoX}lFlMRWD2WJWb12V5|hVo3fF8G;(uxGV?(<9{v zEy8p)rfG23^l;oxcsT^s^z9~fRH}N{jkyL~EAvyb_g3iys(2Qp7aRS&_$#11b~kZM zU5}UWIJD5{_Gm6op%Hp?j2Cra{=;m3CAr6l z$>3QK#Y*?s_E6kcezTUMeA)dgtjvj_k3xxM0Qv57ce03~8xsBMa>R_gPQPf#@LdSo$~epbNqo+G5sa(J1k?!MASLPvn|H17E5_^7PyMfIqnrq^ zYT#q%k)Q1y-r_ryQx|f+s^YkdISQQLb(BY(uT?soeK;FT9&z1>_V_8mb^L%4x{W0m zPyQK2V@r-JB83(M{(U7H>N6iekHXp$RI^9+7Sz!^ithti$1=1IWJ->&OJpr(+9Am- ze`>EnEnluxrtT3D!ta5T)cIIBaKImAKrLs@ODB zA+o+R263e+d{YOLEujC&=dAf{51cPMQ#EF$ruZ%6-HB({dLtsRObu_kaFYcb`twqZ zYgS*J`ocRsQS{ac375Og ziRU3zO&84vysDSM^Apjbmi_!rW*_-5EIS%*jAp4NFQ#oVq{(M`)nx7f?>!!`rX8fg zF|OZAR^gH=`oFbv6ZZljHCPeyCdK){$2P}vNx<}hf_$l<#$!bffg1Pck|5R0^VxpC zvMZYci2Ky@7TGUHx|l#uq-bQ)h{*{*D2!SVzn$xc-g+QLHB6xn8XvzixyJ^K)PoeuE7%EMA*(H-^wcd)L#alEe)`n>CJ%NC!{mj z$7vvz? znUC^gENLYH)K*Edf1#J~@EZHe_Td_#*m=z)MSQZ>3usy%#j&fIq+I4sY?E#8Qdi{! zD98gGeGOr-uCQ}NC*P2DY3Ufv9PvW*jcrtrXmy*|7ObSf1d%v)r1QA++W7$!Rxq)y{lj zv5a-Qb%2SX`?bd`pXCK`6{X(;EEtQcK>N*4+`HW0Tq~hzG~o94RBR9OfOBx_@$v?+ zBs{JALU$O|2e4=ws;k6Yk?#3WK=suwJ))%(uG;}v$;uV^#THO!YDL~c&H9hL|42Gd z;|>y9CpUm0f+F~|z(&c=F7YJfW^KSOm$`P?mBZQajWdZAx9zAo|Grt&o{q&rdFhT5 z`Y_ohp%UJwv!={f>pA*dOM;^=wLg`hz%Is~q-yEbgLS+eD9m{E7ufp_1UM^DoNJX2 zJo8M&)Ux(xupFl{I&j!?TjT-_Z>NwQ54rAe;2{IL|(pHTT9=8v8dy|GjynK7XE&}KLi`@ zO=e1ES#NxuAAv+TBVZ&753FMRX;+tg%*%RgU1)asI<+W_6+-+mgiO(J+S4K3^`@EI zBSbuaO{I*38<{nNCj^8on3Xn3O)tt=LH?eN7xup`@I}Ararv8VR$)a>h=)d-z)lsW zn-`m=1Q8BNT057-LD3Vw02Cs;6n^hWH54npaZ@8f?byU7e6&%5Di6KsrLf4u7vxf! zi6VnTyI8};{Xwgr==EWJt?6Io%7M5seoFkyNQAUGW9%m8gwFvyXw7tce|Sb-UqlI+ zO6l4s-iMRMm(cYRccyNU^~9;#CJ0vWL5YL(;7@aX>i~GvWSc@`l+P-omPUD`v9ff; zK$))Sb8IG|Jyd(D3ZO|f3)Sb2Itzdx;2VrF`y?zNSxy11K%*V_6N3QHDu{r-sOcS4t~q35a2_;$}!*%Xp8 zk2HBJTN(`O35=C9)xEI7%3YD^zp9h%^3x z9=t0UO^js$`RT_((I_>iqisBB>(UT#)yn)v54*srKTG8ZVQf^d=m#ZCzkys@3Y8QA3;0zCWjUgjD^-a3klBJ7{Uh` z@F<0r$gz0YW*W;;E;8Ax6}hU-TtTw;NBho8_4_gtrW7&)SMNxrjTfV0+Dtff>?;~n zog*E8MjXe`Ro3}dQR>ZF2Aai=kiM*;Ok`vUOizN5J$`|E#cBnDx4uwtxt<=XzDPeUTRm@+v+&DQWWv74cWw-)i3ux z$5?kS|4L@}y8nr!!Wy)d>riO0zaeh?>GNw~=*dyltNflqQ@$k>XV>BSA;Png0fb#x zbRJ+k0TBevrf}uzX!UldYTJ^|ar_{HpV{uwjf{2ZB?{^1RI2{1t1NST1?Kd$BG)lA zS4GVmmOa90Cmae9U}g+?iU*8QaS@&ATJ-LVkKv1d7?Av`q=$1f)7U?dvFtosq<7z? z#-$|y%aIJYYh^$K^s+FwKTPbCB3azIN}`aql%mvn9)G!fvX0%}&oY8B@8rw~m)RT3 zRP)m1f!^Uy=98kTOb#3&#lQPeKV#D`hAFh6-(2OAJKuvGV7@KQBezGVZZ9s>_?!eSgi$@7lWw_^%v{U5Kp zu;D~w7@L`9$(cPXZVL0nKH~PSC^zoJObS3)z1>>b2CYH(?2bD;D}B~R-8uN(C+6zD zkODmT?D!_3$L#B^-x}O`69z{30Df4Iy(Ek7IbrbvhywqUI_b$f8`)aiGh~hAM)Z=9 zsuV&|_rUvFFORt;Uc!wlpIK^$XxM{qE0WvL1#jdvH?;JV5J?>duaV%kK_)RbeJivF zvVC__ft1ky2D)H=W>EB0I^q1@*D81{Of1-mSd*v?Cho&7N_UDlU?1@J%>sY$@!qBw z{P}Ha^)go@!?nBw^olvAe<6&Mqu;8moiC!=jloM6aYGhP;jO;Zu0g(EawsY=X)FkE zRYH)c`grv3NKIEs`rd@U5t^H?E%kj+K5V~Sp%!IOpgH;i$qw&+ zbk*sdDCq@D?UG@@bB?WW^%+8Hq!K)@)ppuqT=Pz6M%Fb*v2%tWy~#a=+fO0jm5a#Z ztTKLH3?$|a@$wuc&=HFRjU*Fl;|W%>V)9@Hsql#9Pp5H5PcRcIkEL}NIDh85-FqXH z(aiX%gGKsHJ!jfJ3h|%v-KG;)P5N7LBEsWh+lk& z^mLlODg*eD-CNnK?zxe($B@fB9E2tov5&ov)_sW9^dM*Bu5|9bR|)2m_qYC_UNMm2 zt_P)h*I_4mS@<^aGjRz-fZK@-coJm@31_jc2$0iI#fM9^mOpd$AYhkuxDh}p^HgF{>EEtge$DB#=;$gG_6Tp8HGbKf0!kC2J#gs8&-QK!4WwqsXNL zs;$2uB=v#V0y$8tK_o3Fy=bz`C&ooWZi;G<3j#i_?{2Ot`s>()S;(9q5%|xb_9{tu zJ!F&;>$P~45jW3nFQH%U{>OhrcXi7O5n1zeqx?;%nhJx9soG^7_kf?ZM)>~j3zJLu zs~VHW3#SO!GA01=ikXm*oTq&tw7m#Nqw7tRVGxHtZXZhkhuCO{=8Te%AVr^Gl0gTf zl5N3=si!B63!id3dG1BPuqeoqUu`Y3!?ia+EjzD!W@&AO zUUXE8c!I*AF_3u#^1MIv4s3>(=1t$*j%@OPN6bxWYQ4Xkrc+c@C&0_q7+*q;6yd?! zLZs1*Y<||cHPl?~(S`1ho;SllO>tOBgcHe>%MQqyy?*l+RwYTqRWvh4m)30^w4x9o>-aimd zaSq-77%y8~tzYLTN{+20l+p*ED7teSG5%5){Ar~Xhu>vi*~FvMOX2<3>8NeM-q1J*0`o>c^qEPgm>rJneWW9pp6k^DjWUsFFH9G3WSNjqhi`v}ez zIR<5}PzxZt8si?%lgP3e6Ae0|4D-Xy&2#1%&Xo&*NHtaLL_)dhsf)m2V5c=N8?4tR z!ZZ|<4rYsS>$@K8NpJV$%40=g{28PnZi2U%po*}5lSXh}EC2@8$O$2YUL%y!gfa3* ztTx{_s$I47jtqtH)&OF2iaqi!>w%OC^ceJH5Zr7dTYt_hgNREg{H*D+=u&4^tJuhr rr(tJ;Kcejb00000@Mi{+00FhU0jH@CzlD_7J}?c70ssI200CKA1gSZ- literal 0 HcmV?d00001 diff --git a/data/demo_ml_tibble.rda b/data/demo_ml_tibble.rda new file mode 100644 index 0000000000000000000000000000000000000000..a09f6998557688839c5311432393e9b0ca417d06 GIT binary patch literal 848 zcmV-W1F!u3H+ooF0004LBHlIv03iV!0000G&sfahq@M#6T>vQ&2UKVgRpfklK zA+`t`^LEM5g0JmSTuuD9u`sR7lvVPh#<099W(Y;QY4=Q4DoJ=7Y`4keF@$3UJH^{i zmrgC7E|r+?{-&KT1?NklPzp14tr^+_s{z0dj@OW6FIkXU4r1*Dr!~{{I|wl{NMDuH zw69+fntYI4+>_BV9SMh;Bj$UD&ZBj$l2#8!+Ld^=C*3ZBO+s3NAD#I=lb@pnKx7+fvHr~y29r-OCx0rYHqJ52KaJGsMrjZd0 z4A3WcSpj_2Fp!*`(o;*lXi5JB?Ks_{*q)NA7+!-gHDI!H9zNWraO08zd_g`$uN^oA z{PbRlhr3qq-y(hK@p)2GtC!pyAjHOdle${KwLh!1!u}mSGf%8Xfs_9#wlYWVcsN8I zTnQG18{Y$r1whysA$M@!RSl%@fIkYy6vF>hQmS{Bc(r+&9+#X;ggb))N7V04%Qc5< zC6jS+EZ=GRqp(*%q$ElUL&Vc>N>cTAc8~JRzpqDufxDA)=gdH?3d)K4Xz7|kWou#6 z*K2&xpOP~MMrAjal|z7RB)*-{yjyio<4o=Fl_G|WmA}?@RY}=If5L8`hQA}N+A!9J zsBKq)?sjJM&Til+x6oaushKY;8XdAkC(0h9WAyUS3j@bJjWvrKuM`YQp}%b;JD2iP z3c@)-@GoBNo`y~Z!dyO%t9UsO@TKa>xUQvVDIWxmbeNY6C#NB04pvkn4TokYwHa8Q zi%$R9x4(NH%WaM%XY^lq6h=m9&@P^wNto>Fnv7m z4iDaJ^~wMh#;jP}G#N+jPt4T&$IT$n zrsj@sp9o?z>Mo@!HuD?WClh@siDGoSAZeQgS + dplyr::group_by(.data[[target]]) |> + dplyr::slice_sample(n = 30) |> + dplyr::ungroup() |> + dplyr::select(dplyr::all_of(c( + "genome_id", target, + head(setdiff(colnames(full), c("genome_id", target)), 80) + ))) + +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +train_data <- rsample::training(data_split) + +wflow <- buildWflow(buildLRModel(), buildRecipe(train_data, use_pca = FALSE)) +grid <- buildTuningGrid("LR", penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1)) +tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) +demo_fit <- fitBestModel(selectBestModel(tune_res, wflow, "mcc"), train_data) + +usethis::use_data(demo_ml_tibble, demo_fit, overwrite = TRUE, compress = "xz") diff --git a/man/amRml-package.Rd b/man/amRml-package.Rd index bdd7a8b..50cd609 100644 --- a/man/amRml-package.Rd +++ b/man/amRml-package.Rd @@ -19,6 +19,11 @@ Useful links: \author{ \strong{Maintainer}: Janani Ravi \email{janani.ravi@cuanschutz.edu} (\href{https://orcid.org/0000-0001-7443-925X}{ORCID}) +Authors: +\itemize{ + \item Janani Ravi \email{janani.ravi@cuanschutz.edu} (\href{https://orcid.org/0000-0001-7443-925X}{ORCID}) +} + Other contributors: \itemize{ \item Ethan Wolfe \email{ethan.wolfe@cuanschutz.edu} [contributor] diff --git a/man/applyBenjaminiHochberg.Rd b/man/applyBenjaminiHochberg.Rd index 75867ee..40e5897 100644 --- a/man/applyBenjaminiHochberg.Rd +++ b/man/applyBenjaminiHochberg.Rd @@ -17,3 +17,10 @@ Returns Fisher results with added cols for adj_p_value, sig_after_bh, Q \description{ applyBenjaminiHochberg } +\examples{ +fisher_results <- tibble::tibble( + gene = paste0("gene_", 1:5), + p_value = c(0.001, 0.01, 0.04, 0.2, 0.5) +) +applyBenjaminiHochberg(fisher_results, Q = 0.05) +} diff --git a/man/buildLRModel.Rd b/man/buildLRModel.Rd index 3ed514e..344617e 100644 --- a/man/buildLRModel.Rd +++ b/man/buildLRModel.Rd @@ -7,7 +7,7 @@ buildLRModel(multi_class = FALSE) } \arguments{ -\item{multi_class}{\link[arrow:data-type]{arrow::bool} Whether to construct a model for multi-class +\item{multi_class}{\link[arrow:bool]{bool} Whether to construct a model for multi-class classification} } \value{ @@ -16,3 +16,7 @@ A \code{parsnip} \code{logistic_reg} object \description{ Builds a logistic regression model. } +\examples{ +buildLRModel() +buildLRModel(multi_class = TRUE) +} diff --git a/man/buildRecipe.Rd b/man/buildRecipe.Rd index 5c2d790..704dd1e 100644 --- a/man/buildRecipe.Rd +++ b/man/buildRecipe.Rd @@ -11,9 +11,9 @@ buildRecipe(train_data, use_pca = FALSE, pca_threshold = 0.95) training. This can be the output of \code{rsample::training(splitMLInputTibble(ml_input_tibble))}.} -\item{use_pca}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to use PCA instead of all features.} +\item{use_pca}{\link[arrow:bool]{bool} Set to \code{TRUE} to use PCA instead of all features.} -\item{pca_threshold}{\link[pillar:num]{pillar::num} The proportion of total variance for which the +\item{pca_threshold}{\link[pillar:num]{num} The proportion of total variance for which the principle components account} } \value{ @@ -22,3 +22,12 @@ A \code{recipe} object \description{ Specify predictors, outcome, and metadata by building a recipe. } +\examples{ +train <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + feat_a = rep(c(0L, 1L), 5), + feat_b = rep(c(1L, 0L), 5) +) +buildRecipe(train, use_pca = FALSE) +} diff --git a/man/buildTuningGrid.Rd b/man/buildTuningGrid.Rd index 498f310..1841efe 100644 --- a/man/buildTuningGrid.Rd +++ b/man/buildTuningGrid.Rd @@ -11,13 +11,13 @@ buildTuningGrid( ) } \arguments{ -\item{model}{\link[rlang:vector-construction]{rlang::chr} Currently, logistic regression ("LR") is supported.} +\item{model}{\link[rlang:chr]{chr} Currently, logistic regression ("LR") is supported.} -\item{penalty_vec}{\link[pillar:num]{pillar::num} A vector containing \code{penalty} (regularization +\item{penalty_vec}{\link[pillar:num]{num} A vector containing \code{penalty} (regularization strength) values to try (for logistic regression). Recommended range: 10^-4 to 10^4.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing \code{mixture} values to try for logistic +\item{mix_vec}{\link[pillar:num]{num} A vector containing \code{mixture} values to try for logistic regression. 0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values (0, 1) correspond to elastic net.} } @@ -27,3 +27,10 @@ A logistic regression tuning grid as a tibble \description{ Builds a tuning grid according to the input model. } +\examples{ +buildTuningGrid( + model = "LR", + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1) +) +} diff --git a/man/buildWflow.Rd b/man/buildWflow.Rd index 1bf7932..e3a1a6b 100644 --- a/man/buildWflow.Rd +++ b/man/buildWflow.Rd @@ -18,3 +18,14 @@ A \code{workflow} object \description{ Builds a \code{tidymodels} workflow based on an input model and recipe. } +\examples{ +train <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + feat_a = rep(c(0L, 1L), 5), + feat_b = rep(c(1L, 0L), 5) +) +rec <- buildRecipe(train, use_pca = FALSE) +lr <- buildLRModel() +buildWflow(lr, rec) +} diff --git a/man/calculateEvalMets.Rd b/man/calculateEvalMets.Rd index a06b7e7..c4529a9 100644 --- a/man/calculateEvalMets.Rd +++ b/man/calculateEvalMets.Rd @@ -19,3 +19,20 @@ accuracy, normalized (to a 0 to 1 scale instead of -1 to 1) Matthews correlation coefficient (nMCC), and log2(AUPRC/prior) based on the AMR phenotype predictions by an ML model compared against the actual values. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 5), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), + levels = c("Resistant", "Susceptible") + ), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +) +calculateEvalMets(preds) +} diff --git a/man/calculateMinSamples.Rd b/man/calculateMinSamples.Rd index 66a18e1..98347cd 100644 --- a/man/calculateMinSamples.Rd +++ b/man/calculateMinSamples.Rd @@ -25,3 +25,7 @@ Returns the minimum number of total observations needed (one bug-drug combo) to have at least one observation of the rarer class in each CV fold or, in validation mode, across train/val/test. } +\examples{ +calculateMinSamples(n_fold = 5, split = c(1, 0), res_prop = 0.3) +calculateMinSamples(n_fold = 5, split = c(0.6, 0.2), res_prop = 0.1) +} diff --git a/man/computeFeatureFreq.Rd b/man/computeFeatureFreq.Rd index a0b33f1..3c79dd4 100644 --- a/man/computeFeatureFreq.Rd +++ b/man/computeFeatureFreq.Rd @@ -19,3 +19,16 @@ The BH Fisher data table with added feature frequency columns \description{ computeFeatureFreq } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), + gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +) +encoded <- encodePhenotype(df) +fisher <- applyBenjaminiHochberg( + runFisherTests(encoded$df, encoded$target), Q = 0.05 +) +computeFeatureFreq(encoded$df, fisher, encoded$target) +} diff --git a/man/createMLResultDir.Rd b/man/createMLResultDir.Rd index 2e522b2..ec032f0 100644 --- a/man/createMLResultDir.Rd +++ b/man/createMLResultDir.Rd @@ -42,17 +42,10 @@ Creates a structured directory hierarchy for storing machine learning results including matrices, performance metrics, feature importance, models, and predictions. } \examples{ -\dontrun{ -# Basic directory structure -paths <- createMLResultDir("/path/to/results") - -# LOO analysis stratified by year -paths_loo <- createMLResultDir("/path/to/results", - stratify_by = "year", - LOO = TRUE) - -# MDR analysis -paths_mdr <- createMLResultDir("/path/to/results", MDR = TRUE) -} +out_dir <- file.path(tempdir(), "amRml_createdir_example") +dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +createMLResultDir(out_dir) +createMLResultDir(out_dir, stratify_by = "year", LOO = TRUE) +createMLResultDir(out_dir, MDR = TRUE) } diff --git a/man/createMLinputList.Rd b/man/createMLinputList.Rd index 2104415..7e15eed 100644 --- a/man/createMLinputList.Rd +++ b/man/createMLinputList.Rd @@ -49,8 +49,9 @@ inputs <- createMLinputList("/path/to/results") # Cross-test with year stratification inputs_ct <- createMLinputList("/path/to/results", - stratify_by = "year", - cross_test = TRUE) + stratify_by = "year", + cross_test = TRUE +) # MDR analysis inputs_mdr <- createMLinputList("/path/to/results", MDR = TRUE) diff --git a/man/demo_fit.Rd b/man/demo_fit.Rd new file mode 100644 index 0000000..b7b38db --- /dev/null +++ b/man/demo_fit.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{demo_fit} +\alias{demo_fit} +\title{Small demo LR fit} +\format{ +A fitted \code{workflow} object (output of \code{\link[=fitBestModel]{fitBestModel()}}). +} +\source{ +Built by \code{inst/scripts/make_demo_data.R}. +} +\usage{ +data(demo_fit) +} +\description{ +A tuned logistic-regression workflow fitted on \link{demo_ml_tibble}, for use +in package examples that require a pre-fitted model. +} +\keyword{datasets} diff --git a/man/demo_ml_tibble.Rd b/man/demo_ml_tibble.Rd new file mode 100644 index 0000000..4a02e3e --- /dev/null +++ b/man/demo_ml_tibble.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{demo_ml_tibble} +\alias{demo_ml_tibble} +\title{Small demo ML input tibble} +\format{ +A tibble with 60 rows and 82 columns: \code{genome_id}, +\code{genome_drug.resistant_phenotype}, and 80 binary feature columns. +} +\source{ +Built by \code{inst/scripts/make_demo_data.R}. +} +\usage{ +data(demo_ml_tibble) +} +\description{ +A subset of the AMP-genes-binary matrix generated from the bundled +\code{Sfl_parquet.duckdb} fixture, trimmed to 60 rows (30 Resistant, 30 +Susceptible) and ~80 feature columns for use in package examples. +} +\keyword{datasets} diff --git a/man/dot-getTargetVarName.Rd b/man/dot-getTargetVarName.Rd index cdbf92d..641ca8d 100644 --- a/man/dot-getTargetVarName.Rd +++ b/man/dot-getTargetVarName.Rd @@ -23,3 +23,12 @@ either \code{rlang::sym("genome_drug.resistant_phenotype")} or Returns the name of the target variable to be used for machine learning: either \code{genome_drug.resistant_phenotype} or \code{resistant_classes} } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:4), + genome_drug.resistant_phenotype = c("Resistant", "Susceptible", + "Resistant", "Susceptible"), + feat_a = c(1L, 0L, 1L, 0L) +) +.getTargetVarName(ml) +} diff --git a/man/encodePhenotype.Rd b/man/encodePhenotype.Rd index 7067e7d..14993c2 100644 --- a/man/encodePhenotype.Rd +++ b/man/encodePhenotype.Rd @@ -27,3 +27,14 @@ A list with: \description{ encodePhenotype } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = c("Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible"), + gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), + gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) +) +encoded <- encodePhenotype(df) +encoded$target +} diff --git a/man/extractTopFeats.Rd b/man/extractTopFeats.Rd index c6ab154..b43cf0c 100644 --- a/man/extractTopFeats.Rd +++ b/man/extractTopFeats.Rd @@ -9,14 +9,14 @@ extractTopFeats(fit, prop_vi_top_feats = c(0, 1), n_top_feats = NA) \arguments{ \item{fit}{Best model fit, such as the output of \code{fitBestModel()}} -\item{prop_vi_top_feats}{\link[pillar:num]{pillar::num} A vector of length 2 with elements together +\item{prop_vi_top_feats}{\link[pillar:num]{num} A vector of length 2 with elements together indicating the proportion of total variable importance the top features should comprise. To get the features that contribute to the top 10 to 20\% of total variable importance, for example, set \code{prop_vi_top_feats = c(0.1, 0.2)}. Returns all features by default.} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to extract} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to extract} } \value{ A tibble with a column for top features (\code{Variable}), a column for @@ -27,3 +27,7 @@ per-class columns of importance scores for each \code{Variable}) Returns the top features that an ML model found to be important for predicting the AMR phenotype of a given pathogen. } +\examples{ +data(demo_fit) +extractTopFeats(demo_fit, n_top_feats = 10) +} diff --git a/man/fitBestModel.Rd b/man/fitBestModel.Rd index 2724cb1..cdc7c58 100644 --- a/man/fitBestModel.Rd +++ b/man/fitBestModel.Rd @@ -19,3 +19,15 @@ Best model fit \description{ Fits the best model (output of \code{selectBestModel()}). } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +train <- rsample::training(data_split) +wflow <- buildWflow(buildLRModel(), buildRecipe(train)) +set.seed(1) +tune_res <- tuneGrid(wflow, data_split, + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2) +best_wflow <- selectBestModel(tune_res, wflow, "mcc") +fitBestModel(best_wflow, train) +} diff --git a/man/getConfusionMatrix.Rd b/man/getConfusionMatrix.Rd index a47b012..1047fba 100644 --- a/man/getConfusionMatrix.Rd +++ b/man/getConfusionMatrix.Rd @@ -17,3 +17,18 @@ Confusion matrix of class \code{conf_mat} Returns the confusion matrix based on the AMR phenotype predictions by an ML model compared against the actual values. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:8), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 4), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c("Resistant", "Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible", "Susceptible"), + levels = c("Resistant", "Susceptible") + ) +) +getConfusionMatrix(preds) +} diff --git a/man/getNumFeat.Rd b/man/getNumFeat.Rd index d616bb2..75f2565 100644 --- a/man/getNumFeat.Rd +++ b/man/getNumFeat.Rd @@ -20,3 +20,12 @@ Number of features \description{ Returns the number of features in an amR pangenome. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), + feat_b = c(0L, 1L, 0L, 1L, 0L, 1L) +) +getNumFeat(ml) +} diff --git a/man/loadMLInputTibble.Rd b/man/loadMLInputTibble.Rd index 0cdb61b..8e3310b 100644 --- a/man/loadMLInputTibble.Rd +++ b/man/loadMLInputTibble.Rd @@ -7,7 +7,7 @@ loadMLInputTibble(parquet_path) } \arguments{ -\item{parquet_path}{\link[rlang:vector-construction]{rlang::chr} Path to a long-format sparse Parquet file from +\item{parquet_path}{\link[rlang:chr]{chr} Path to a long-format sparse Parquet file from upstream processing.} } \value{ @@ -18,3 +18,16 @@ A tibble (one row per genome, one column per feature), plus Reads in a Parquet file as a tibble and converts it from long to wide format to prepare it for ML. } +\examples{ +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:6), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 6), + value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 3), each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +loadMLInputTibble(tmp) +} diff --git a/man/plotBaselineComparison.Rd b/man/plotBaselineComparison.Rd index 2b6b94d..b212ab2 100644 --- a/man/plotBaselineComparison.Rd +++ b/man/plotBaselineComparison.Rd @@ -20,3 +20,14 @@ A bar plot with balanced accuracy comparisons per antibiotic Generates a bar plot that compares model performance with and without randomly shuffled AMR phenotype labels. } +\examples{ +non_shuffled <- tibble::tibble( + antibiotic = c("AMP", "CIP", "CRO"), + bal_acc = c(0.88, 0.81, 0.92) +) +shuffled <- tibble::tibble( + antibiotic = c("AMP", "CIP", "CRO"), + bal_acc = c(0.52, 0.49, 0.55) +) +plotBaselineComparison(non_shuffled, shuffled) +} diff --git a/man/plotDefaultEval.Rd b/man/plotDefaultEval.Rd index 25eeaf9..9eb41ad 100644 --- a/man/plotDefaultEval.Rd +++ b/man/plotDefaultEval.Rd @@ -15,16 +15,16 @@ plotDefaultEval( \arguments{ \item{default_eval_tibble}{Output of \code{findOptimalMLDefaults()}} -\item{x_default_eval}{\link[rlang:vector-construction]{rlang::chr} x value of default evaluation plot: "train_prop" +\item{x_default_eval}{\link[rlang:chr]{chr} x value of default evaluation plot: "train_prop" or "n_fold"} -\item{y_default_eval}{\link[rlang:vector-construction]{rlang::chr} y value of default evaluation plot. It can be +\item{y_default_eval}{\link[rlang:chr]{chr} y value of default evaluation plot. It can be "avg_runtime_sec" or one of the following performance metrics: "avg_f1_score", "avg_log2_apop", "avg_bal_acc", or "avg_nmcc"} -\item{xlab}{\link[rlang:vector-construction]{rlang::chr} Label for x axis} +\item{xlab}{\link[rlang:chr]{chr} Label for x axis} -\item{ylab}{\link[rlang:vector-construction]{rlang::chr} Label for y axis} +\item{ylab}{\link[rlang:chr]{chr} Label for y axis} } \value{ A \code{ggplot2} scatterplot (performance metric or runtime vs. @@ -34,3 +34,13 @@ A \code{ggplot2} scatterplot (performance metric or runtime vs. Plots performance metric or runtime vs. training data proportion or number of cross-validation folds, colored by model. } +\examples{ +default_eval <- tibble::tibble( + train_prop = c(0.5, 0.6, 0.7, 0.5, 0.6, 0.7), + n_fold = rep(5, 6), + model = rep(c("LR", "RF"), each = 3), + avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) +) +plotDefaultEval(default_eval, x_default_eval = "train_prop", + y_default_eval = "avg_f1_score") +} diff --git a/man/plotFishers.Rd b/man/plotFishers.Rd index ba515df..53a326f 100644 --- a/man/plotFishers.Rd +++ b/man/plotFishers.Rd @@ -35,9 +35,18 @@ Color explicitly encodes whether a feature passes the BH threshold. Labels are applied only to the top-ranked features to preserve clarity. } \examples{ -\dontrun{ -plotFishers(fisher_results) -plotFishers(fisher_results, label_top_n = 0) -} +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:10), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 10), + value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 5), each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +fisher_results <- runFishers(tmp, Q = 0.05) +plotFishers(fisher_results, alpha = 0.05, label_top_n = 2) } diff --git a/man/plotPRC.Rd b/man/plotPRC.Rd index e1d3cc4..19fe869 100644 --- a/man/plotPRC.Rd +++ b/man/plotPRC.Rd @@ -17,3 +17,20 @@ A precision-recall curve as a \code{ggplot2} object Plots the precision-recall curve given a set of test data plus predicted AMR phenotypes. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 5), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), + levels = c("Resistant", "Susceptible") + ), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +) +plotPRC(preds) +} diff --git a/man/plotTopFeatsVI.Rd b/man/plotTopFeatsVI.Rd index e7d3008..9bd920b 100644 --- a/man/plotTopFeatsVI.Rd +++ b/man/plotTopFeatsVI.Rd @@ -9,7 +9,7 @@ plotTopFeatsVI(fit, n_top_feats = 10) \arguments{ \item{fit}{Best model fit, such as the output of \code{fitBestModel()}} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to plot} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to plot} } \value{ Variable importance plot (a \code{ggplot2} object) @@ -18,3 +18,7 @@ Variable importance plot (a \code{ggplot2} object) Generates a plot showing the top features and their variable importance scores. } +\examples{ +data(demo_fit) +plotTopFeatsVI(demo_fit, n_top_feats = 10) +} diff --git a/man/predictML.Rd b/man/predictML.Rd index 2da4b8e..53a882c 100644 --- a/man/predictML.Rd +++ b/man/predictML.Rd @@ -21,3 +21,9 @@ labels Predicts the AMR phenotype in the test data set based on the best fit ML model. } +\examples{ +data(demo_ml_tibble) +data(demo_fit) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +predictML(demo_fit, rsample::testing(data_split)) +} diff --git a/man/removeTopFeats.Rd b/man/removeTopFeats.Rd index a6fe1ea..fe4bd79 100644 --- a/man/removeTopFeats.Rd +++ b/man/removeTopFeats.Rd @@ -19,3 +19,14 @@ An amR pangenome with top features removed \description{ Removes top features from a pangenome given previous ML results. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), + feat_b = c(0L, 1L, 0L, 1L, 0L, 1L), + feat_c = c(1L, 1L, 0L, 0L, 1L, 0L) +) +top <- tibble::tibble(Variable = c("feat_a", "feat_c")) +removeTopFeats(ml, top) +} diff --git a/man/runFisherTests.Rd b/man/runFisherTests.Rd index 96c1fa6..676b36c 100644 --- a/man/runFisherTests.Rd +++ b/man/runFisherTests.Rd @@ -19,3 +19,13 @@ A tibble with columns: gene, p_value \description{ runFisherTests } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), + gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +) +encoded <- encodePhenotype(df) +runFisherTests(encoded$df, encoded$target) +} diff --git a/man/runFishers.Rd b/man/runFishers.Rd index f654d2d..d90593c 100644 --- a/man/runFishers.Rd +++ b/man/runFishers.Rd @@ -29,3 +29,17 @@ The Fisher test results, BH correction, and feature frequencies \description{ runFishers } +\examples{ +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:10), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 10), + value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 5), each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +runFishers(tmp, Q = 0.05) +} diff --git a/man/runIFE.Rd b/man/runIFE.Rd index 312ceba..b9d2ce7 100644 --- a/man/runIFE.Rd +++ b/man/runIFE.Rd @@ -19,28 +19,28 @@ runIFE( \arguments{ \item{ml_input_tibble}{An ML-ready tibble generated by \code{loadMLInputTibble()}} -\item{by_num}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} if removing top features as a percentage +\item{by_num}{\link[arrow:bool]{bool} Set to \code{TRUE} if removing top features as a percentage of the total number of features.} -\item{by_vi}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} if removing top features by their +\item{by_vi}{\link[arrow:bool]{bool} Set to \code{TRUE} if removing top features by their contribution to the total variable importance.} -\item{percent_removal_vec}{\link[pillar:num]{pillar::num} A vector of percentages of total top +\item{percent_removal_vec}{\link[pillar:num]{num} A vector of percentages of total top features removed (if \code{by_num = TRUE} and \code{by_vi = FALSE}) or percent contributions of top features removed to the total variable importance (if \code{by_vi = TRUE} and \code{by_num = FALSE}) at each iteration. The function will automatically train with all features to start, so there is no need to specify 0 in this vector.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing mixture values (0 corresponds +\item{mix_vec}{\link[pillar:num]{num} A vector containing mixture values (0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values correspond to elastic net) to try. The default corresponds to L2 for the purpose of IFE analysis.} -\item{return_feats}{\link[arrow:data-type]{arrow::bool} Whether to return top features from each iteration +\item{return_feats}{\link[arrow:bool]{bool} Whether to return top features from each iteration of IFE.} -\item{verbose}{\link[arrow:data-type]{arrow::bool} The function will stay quiet if set to \code{FALSE}.} +\item{verbose}{\link[arrow:bool]{bool} The function will stay quiet if set to \code{FALSE}.} } \value{ A tibble with IFE performance (note: this will be returned within a @@ -51,3 +51,12 @@ runIFE Removes top features identified by ML models and retrains iteratively; returns nMCC at each iteration. } +\examples{ +data(demo_ml_tibble) +set.seed(1) +runIFE( + demo_ml_tibble, by_num = TRUE, by_vi = FALSE, + percent_removal_vec = c(25, 50), mix_vec = 0, + return_feats = TRUE, verbose = FALSE +) +} diff --git a/man/runMDRmodels.Rd b/man/runMDRmodels.Rd index 325c14a..74be1e3 100644 --- a/man/runMDRmodels.Rd +++ b/man/runMDRmodels.Rd @@ -52,7 +52,7 @@ NULL (invisible). Called for side effects (model training and result saving). } \description{ Executes machine learning pipeline for MDR analysis using logistic regression -with parallel processing via the future backend. Trains models on all MDR +with parallel processing via BiocParallel. Trains models on all MDR parquet files and saves results to designated output directories. } \examples{ @@ -62,13 +62,15 @@ runMDRmodels("/path/to/results") # Run with more threads and minimal output runMDRmodels("/path/to/results", - threads = 32, - verbose = FALSE) + threads = 32, + verbose = FALSE +) # Run without saving model fits (save disk space) runMDRmodels("/path/to/results", - threads = 16, - return_fit = FALSE) + threads = 16, + return_fit = FALSE +) } } diff --git a/man/runMLPipeline.Rd b/man/runMLPipeline.Rd index 31c542e..2b53951 100644 --- a/man/runMLPipeline.Rd +++ b/man/runMLPipeline.Rd @@ -35,10 +35,10 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{model}{\link[rlang:vector-construction]{rlang::chr} Logistic regression ("LR"), random forest ("RF"), or +\item{model}{\link[rlang:chr]{chr} Logistic regression ("LR"), random forest ("RF"), or boosted tree ("BT")} -\item{split}{\link[pillar:num]{pillar::num} Vector of length 2 indicating the proportion of data to +\item{split}{\link[pillar:num]{num} Vector of length 2 indicating the proportion of data to be designated as training and validation, respectively. Note: if \code{test_data} is provided, these numbers will be scaled so that they sum to 1 and will still represent fractions of \code{ml_input_tibble} (not including the input @@ -47,26 +47,26 @@ function is not equipped to handle this. If cross-validation is enabled here \code{split = c(1,0)}, we will still retain a 20\% test holdout for final reporting. Cross-validation is run on the 80\% training portion, and not on the testing set.} -\item{n_fold}{\link[pillar:num]{pillar::num} Number of folds of cross-validation} +\item{n_fold}{\link[pillar:num]{num} Number of folds of cross-validation} -\item{prop_vi_top_feats}{\link[pillar:num]{pillar::num} A vector of length 2 with elements together +\item{prop_vi_top_feats}{\link[pillar:num]{num} A vector of length 2 with elements together indicating the proportion of total variable importance the top features should comprise. To get the features that contribute to the top 10 to 20\% of total variable importance, for example, \verb{set prop_vi_top_feats = c(0.1, 0.2)}. Returns all features by default.} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to extract per drug} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to extract per drug} -\item{use_pca}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to use PCA instead of all features.} +\item{use_pca}{\link[arrow:bool]{bool} Set to \code{TRUE} to use PCA instead of all features.} -\item{pca_threshold}{\link[pillar:num]{pillar::num} The proportion of total variance for which the +\item{pca_threshold}{\link[pillar:num]{num} The proportion of total variance for which the principle components account} -\item{penalty_vec}{\link[pillar:num]{pillar::num} A vector containing \code{penalty} (regularization +\item{penalty_vec}{\link[pillar:num]{num} A vector containing \code{penalty} (regularization strength) values to try (for logistic regression). It is recommended to choose values \code{10^-4} to \code{10^4}.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing \code{mixture} values to try for logistic +\item{mix_vec}{\link[pillar:num]{num} A vector containing \code{mixture} values to try for logistic regression. 0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values correspond to elastic net.} @@ -78,12 +78,12 @@ or boosted tree. It is recommended to choose values in the range 1 to 100.} \code{trees} in random forest or boosted tree. It is recommended to choose values in the range 100 to 1000.} -\item{select_best_metric}{\link[rlang:vector-construction]{rlang::chr} Metric to select best model: "f_meas", +\item{select_best_metric}{\link[rlang:chr]{chr} Metric to select best model: "f_meas", "pr_auc", or "bal_accuracy"} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} For reproducible analysis} -\item{shuffle_labels}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to randomly shuffle AMR phenotype +\item{shuffle_labels}{\link[arrow:bool]{bool} Set to \code{TRUE} to randomly shuffle AMR phenotype labels for baseline comparisons.} \item{test_data}{A tibble to use as testing data instead of a subset of @@ -91,14 +91,14 @@ labels for baseline comparisons.} or temporal holdouts. The \code{split} argument still tells how \code{ml_input_tibble} should be divided for training and validation.} -\item{return_tune_res}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return tuning results.} +\item{return_tune_res}{\link[arrow:bool]{bool} Set to \code{TRUE} to return tuning results.} -\item{return_fit}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return the model \code{fit}.} +\item{return_fit}{\link[arrow:bool]{bool} Set to \code{TRUE} to return the model \code{fit}.} -\item{return_pred}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return the predicted and actual +\item{return_pred}{\link[arrow:bool]{bool} Set to \code{TRUE} to return the predicted and actual AMR phenotypes.} -\item{verbose}{\link[arrow:data-type]{arrow::bool} The function will stay quiet if set to \code{FALSE}.} +\item{verbose}{\link[arrow:bool]{bool} The function will stay quiet if set to \code{FALSE}.} } \value{ A list with two elements: A \code{performance_tibble} and a @@ -109,3 +109,13 @@ respectively, are set to \code{TRUE}. \description{ Stitches together core ML functions into one pipeline. } +\examples{ +data(demo_ml_tibble) +set.seed(1) +runMLPipeline( + ml_input_tibble = demo_ml_tibble, model = "LR", + split = c(1, 0), n_fold = 2, + penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1), + n_top_feats = 10, verbose = FALSE +) +} diff --git a/man/runMLmodels.Rd b/man/runMLmodels.Rd index cb44718..501e8d8 100644 --- a/man/runMLmodels.Rd +++ b/man/runMLmodels.Rd @@ -121,8 +121,7 @@ For example: \code{"LOO_cross_test_ML_year_performance.tsv"} \note{ This function requires the following packages: \itemize{ -\item \pkg{future} - for parallel processing backend -\item \pkg{future.apply} - for parallel lapply +\item \pkg{BiocParallel} - for parallel processing backend \item \pkg{readr} - for reading/writing TSV files \item \pkg{dplyr}, \pkg{purrr}, \pkg{stringr}, \pkg{tibble} - for data manipulation } @@ -143,21 +142,24 @@ runMLmodels("/path/to/results", stratify_by = "year") # Cross-test with year stratification runMLmodels("/path/to/results", - stratify_by = "year", - cross_test = TRUE, - threads = 32) + stratify_by = "year", + cross_test = TRUE, + threads = 32 +) # LOO analysis stratified by country with cross-testing runMLmodels("/path/to/results", - stratify_by = "country", - LOO = TRUE, - cross_test = TRUE, - verbose = TRUE) + stratify_by = "country", + LOO = TRUE, + cross_test = TRUE, + verbose = TRUE +) # Run without saving model fits (save disk space) runMLmodels("/path/to/results", - stratify_by = "year", - return_fit = FALSE) + stratify_by = "year", + return_fit = FALSE +) } } diff --git a/man/runModelingPipeline.Rd b/man/runModelingPipeline.Rd index 84ea022..a0763be 100644 --- a/man/runModelingPipeline.Rd +++ b/man/runModelingPipeline.Rd @@ -49,3 +49,14 @@ Given a DuckDB file produced by \code{runDataProcessing()}, it: \item Saves performance metrics, fitted models, predictions, and top feature rankings } } +\examples{ +\dontrun{ +runModelingPipeline( + parquet_duckdb_path = "path/to/Bug_parquet.duckdb", + threads = 16, n_fold = 5, split = c(1, 0), min_n = 25, + prop_vi_top_feats = c(0, 1), pca_threshold = 0.99, + verbose = TRUE, use_saved_split = TRUE +) +} + +} diff --git a/man/selectBestModel.Rd b/man/selectBestModel.Rd index 23232bf..c1f6f2d 100644 --- a/man/selectBestModel.Rd +++ b/man/selectBestModel.Rd @@ -11,7 +11,7 @@ selectBestModel(tune_res, wflow, select_best_metric = "mcc") \item{wflow}{A \code{workflows} object, such as the output of \code{buildWflow()}} -\item{select_best_metric}{\link[rlang:vector-construction]{rlang::chr} Metric to select best model: "f_meas", +\item{select_best_metric}{\link[rlang:chr]{chr} Metric to select best model: "f_meas", "pr_auc", "mcc", or "bal_accuracy"} } \value{ @@ -20,3 +20,14 @@ Best model workflow \description{ Selects best model by F1 score, AUPRC, MCC, or balanced accuracy. } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +wflow <- buildWflow(buildLRModel(), + buildRecipe(rsample::training(data_split))) +set.seed(1) +tune_res <- tuneGrid(wflow, data_split, + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2) +selectBestModel(tune_res, wflow, "mcc") +} diff --git a/man/shuffleLabels.Rd b/man/shuffleLabels.Rd index 9c672b5..83a6c72 100644 --- a/man/shuffleLabels.Rd +++ b/man/shuffleLabels.Rd @@ -14,7 +14,7 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} For reproducible analysis} } \value{ Pangenome with randomly shuffled AMR phenotype labels @@ -22,3 +22,11 @@ Pangenome with randomly shuffled AMR phenotype labels \description{ Randomly shuffles the AMR phenotype labels in the ML-ready tibble. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L) +) +shuffleLabels(ml, seed = 42) +} diff --git a/man/splitMLInputTibble.Rd b/man/splitMLInputTibble.Rd index 7f774b9..bfa3254 100644 --- a/man/splitMLInputTibble.Rd +++ b/man/splitMLInputTibble.Rd @@ -14,10 +14,10 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{split}{\link[pillar:num]{pillar::num} Vector of length 2 indicating the proportion of data to +\item{split}{\link[pillar:num]{num} Vector of length 2 indicating the proportion of data to be designated as training and validation, respectively.} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} For reproducible analysis} } \value{ An \code{rsplit} object @@ -26,3 +26,12 @@ An \code{rsplit} object Splits an ML-ready tibble into training and testing (and in some cases validation) sets. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:20), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 10), + feat_a = rep(c(0L, 1L), 10), + feat_b = rep(c(1L, 0L), 10) +) +splitMLInputTibble(ml, split = c(1, 0), seed = 42) +} diff --git a/man/tuneGrid.Rd b/man/tuneGrid.Rd index d03a8c3..90fc4e9 100644 --- a/man/tuneGrid.Rd +++ b/man/tuneGrid.Rd @@ -15,7 +15,7 @@ tuneGrid(wflow, data_split, grid = buildTuningGrid(model = "LR"), n_fold = 5) \item{grid}{A tuning grid as a tibble, such as the output of \code{buildTuningGrid()}} -\item{n_fold}{\link[pillar:num]{pillar::num} Number of folds of cross-validation} +\item{n_fold}{\link[pillar:num]{num} Number of folds of cross-validation} } \value{ Results of grid tuning @@ -24,3 +24,12 @@ Results of grid tuning Tunes the model according to the tuning grid, workflow, and data split specified. } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +wflow <- buildWflow(buildLRModel(), + buildRecipe(rsample::training(data_split))) +grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +set.seed(1) +tuneGrid(wflow, data_split, grid, n_fold = 2) +} diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index af5bc8e..89f3b18 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -1,17 +1,19 @@ --- title: "Introduction to amRml" -output: rmarkdown::html_vignette +output: + BiocStyle::html_document: + fig_width: 7 + fig_height: 5 vignette: > %\VignetteIndexEntry{Introduction to amRml} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r setup-opts, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, - comment = "#>", - eval = FALSE + comment = "#>" ) ``` @@ -21,354 +23,285 @@ library(amRml) ## Overview -`amRml` is part of a 3-package suite for predicting antimicrobial resistance (AMR) using machine learning models trained on bacterial genomic features. +`amRml` predicts antimicrobial resistance (AMR) from bacterial genomic features. It consumes a DuckDB produced by `amRdata` and produces ML matrices, tuned logistic regression models, per-genome predictions, feature importances, and Fisher's exact tests as a non-ML baseline. -This vignette demonstrates how to generate ML input matrices, train models, and visualize results. +This vignette uses the *Shigella flexneri* (`Sfl`) DuckDB bundled in `inst/extdata`. + +```{r fixture} +fixture <- system.file("extdata", "Sfl_parquet.duckdb", package = "amRml") +out_dir <- file.path(tempdir(), "amRml_vignette") +dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +``` ## Generating ML input matrices -After data curation with `amRdata`, use `generateMLInputs()` to create ML-ready matrices from the DuckDB containing metadata and feature parquets. -The DuckDB with the parquet views is created with `cleanData()` +`generateMLInputs()` reads the bug-level DuckDB (metadata + feature parquets) and writes one long-format sparse parquet per drug × feature × encoding combination into `out_path/matrix/`. With `stratify_by`, it additionally writes year- or country-stratified matrices into `matrix_year/` or `matrix_country/`. ```{r generate-matrices} -# Generate all ML input matrices from curated data generateMLInputs( - parquet_duckdb_path = "results/Sfl_parquet.duckdb", - out_path = "results/", - n_fold = 5, - split = c(1, 0), # CV mode (or use split = 0 as shorthand) - min_n = 25, - verbosity = "minimal" + parquet_duckdb_path = fixture, + out_path = out_dir, + n_fold = 5, + split = c(1, 0), + min_n = 25, + verbosity = "minimal" ) -``` - -This generates: - -- Drug and drug-class matrices for all feature types (genes, proteins, domains, struct) -- Year-stratified and country-stratified matrices -- Leave-one-out (LOO) matrices for temporal/geographic holdouts -- Multi-drug resistance (MDR) matrices - -For classical train/validation/test splits instead of cross-validation: -```{r generate-splits} -# 70% train, 15% validation, 15% test -generateMLInputs( - parquet_duckdb_path = "results/Sfl_parquet.duckdb", - out_path = "results/", - n_fold = NULL, - split = c(0.7, 0.15), - verbosity = "debug" -) +list.files(file.path(out_dir, "matrix"))[1:5] ``` -Use `skipImbalancedMatrix()` to check if a drug-bug combination has sufficient samples: +For a classical train/validation/test split instead of cross-validation, set `n_fold = NULL` and pass a length-2 `split` of `c(train_prop, val_prop)`; the test proportion is the remainder. -```{r skip-check} -skip <- skipImbalancedMatrix( - genome_ids = genome_ids, - phenotype_counts = phenotype_counts, - n_fold = 5, - split = c(1, 0), - min_total_obs = 40 +```{r generate-splits, eval = FALSE} +generateMLInputs( + parquet_duckdb_path = fixture, + out_path = out_dir, + n_fold = NULL, + split = c(0.7, 0.15), + verbosity = "debug" ) ``` -## Loading ML matrices +## Loading a single matrix -The package expects Parquet files in long (sparse) format with these columns: +The generated parquets are long-format sparse tibbles with these columns: | Column | Description | |--------|-------------| | `genome_id` | Unique identifier for each isolate | | `feature_id` | Feature name (gene, protein, domain, or struct) | -| `value` | Binary presence/absence (0 or 1) or count | -| `genome_drug.resistant_phenotype` | "Resistant" or "Susceptible" | +| `value` | Binary presence/absence (0/1) or count | +| `genome_drug.resistant_phenotype` | `"Resistant"` or `"Susceptible"` | -`loadMLInputTibble()` converts this to wide format (one row per genome, one column per feature) for ML. +`loadMLInputTibble()` converts one of them to wide format (one row per genome, one column per feature) ready for ML. ```{r load-data} -# Load a generated matrix -ml_tibble <- loadMLInputTibble( - parquet_path = "results/matrix/Sfl_drug_AMP_domains_binary_sparse.parquet" +matrix_path <- file.path( + out_dir, "matrix", "Sfl_drug_AMP_genes_binary_sparse.parquet" ) -# Check dimensions +ml_tibble <- loadMLInputTibble(matrix_path) n_features <- getNumFeat(ml_tibble) -target_var <- getTargetVarName(ml_tibble) +target_var <- .getTargetVarName(ml_tibble) + +c(n_features = n_features, target_var = target_var) ``` -## Main pipeline +## Per-matrix pipeline -The primary entry point is `runMLPipeline()`, which orchestrates the entire ML workflow: +`runMLPipeline()` runs the train/tune/fit/predict pipeline on a single matrix in memory. Use it to iterate on one drug-feature combo before scaling to all of them. ```{r run-pipeline} results <- runMLPipeline( - ml_input_tibble = ml_tibble, - model = "LR", - split = c(0.6, 0.2), - n_fold = 2, - n_top_feats = 20, + ml_input_tibble = ml_tibble, + model = "LR", + split = c(1, 0), + n_fold = 2, + n_top_feats = 20, + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1), select_best_metric = "mcc", - return_fit = TRUE, - return_pred = TRUE, - verbose = TRUE + return_fit = TRUE, + return_pred = TRUE, + verbose = FALSE ) -# View results results$performance_tibble -results$top_feat_tibble +head(results$top_feat_tibble) ``` ### Output structure -`runMLPipeline()` returns a list with: +`runMLPipeline()` returns a named list: -**`performance_tibble`** - model performance metrics: +**`performance_tibble`** — one row of model performance metrics: | Column | Description | |--------|-------------| | `num_obs` | Number of observations | | `res_prop` | Proportion of resistant samples | | `n_feat` | Number of features | -| `model` | Model type (LR) | +| `model` | Model type (`"LR"`) | | `train_prop`, `val_prop` | Train/validation split proportions | | `fit_penalty`, `fit_mixture` | Fitted hyperparameters | | `nmcc`, `f1`, `bal_acc`, `log2_apop` | Performance metrics | | `run_time_sec` | Runtime in seconds | -**`top_feat_tibble`** - ranked feature importance: +**`top_feat_tibble`** — ranked feature importance: | Column | Description | |--------|-------------| | `Variable` | Feature name | | `Importance` | Variable importance score | -| `Sign` | Direction of effect (POS = resistance, NEG = susceptibility) | +| `Sign` | Direction of effect (`POS` = associated with resistance, `NEG` = with susceptibility) | **Optional outputs** (when `return_*` = TRUE): -- `tune_res` - tuning results from grid search -- `fit` - fitted model workflow object -- `pred` - predictions with `.pred_class`, `.pred_Resistant`, `.pred_Susceptible` +- `tune_res` — tuning results from grid search +- `fit` — the fitted workflow object +- `pred` — predictions with `.pred_class`, `.pred_Resistant`, `.pred_Susceptible` ## Step-by-step model building -For more control, use the individual tidymodels-based functions: +The builders below are what `runMLPipeline()` chains internally. Call them directly when you need control over any individual step. ```{r split-data} -# Split data data_split <- splitMLInputTibble(ml_tibble, split = c(0.6, 0.2), seed = 123) - -library(rsample) -train_data <- training(data_split) -test_data <- testing(data_split) +train_data <- rsample::training(data_split) +test_data <- rsample::testing(data_split) ``` ```{r build-model} -# Build recipe (preprocessing) -recipe <- buildRecipe(train_data, use_pca = FALSE, pca_threshold = 0.95) +recipe <- buildRecipe(train_data, use_pca = FALSE) +lr_mod <- buildLRModel(multi_class = FALSE) +wflow <- buildWflow(lr_mod, recipe) -# Build logistic regression model -lr_model <- buildLRModel(multi_class = FALSE) - -# Build workflow -wflow <- buildWflow(lr_model, recipe) - -# Build tuning grid grid <- buildTuningGrid( - model = "LR", - penalty_vec = 10^seq(-4, -1, length.out = 10), - mix_vec = 0:5 / 5 + model = "LR", + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1) ) ``` ```{r tune-fit} -# Tune hyperparameters -tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 5) - -# Select and fit best model +tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) best_wflow <- selectBestModel(tune_res, wflow, select_best_metric = "mcc") -fit <- fitBestModel(best_wflow, train_data) +fit <- fitBestModel(best_wflow, train_data) -# Get fitted hyperparameters -fit_hps <- getFitHps(fit) - -# Make predictions -predictions <- predict(fit, test_data) +preds <- predictML(fit, test_data) ``` ## Performance metrics +`calculateEvalMets()` returns all of nMCC, F1, balanced accuracy, AUPRC, log2(AUPRC/prior), sensitivity, and specificity from a tibble of predictions + truth. + ```{r metrics} -# Individual metrics -nmcc <- calculatenMCC(predictions) # Normalized MCC (0-1 scale) -f1 <- calculateF1(predictions) # F1 score -bal_acc <- calculateBalAcc(predictions) # Balanced accuracy -auprc <- calculateAUPRC(predictions) # Area under PR curve -log2_apop <- calculateLog2APOP(predictions) # log2(AUPRC/prior) - -# All metrics at once -all_metrics <- calculateEvalMets(predictions) - -# Confusion matrix -conf_mat <- getConfusionMatrix(predictions) +calculateEvalMets(preds) +getConfusionMatrix(preds) ``` -| Metric | Function | Description | -|--------|----------|-------------| -| nMCC | `calculatenMCC()` | Normalized Matthews correlation coefficient (0-1) | -| F1 | `calculateF1()` | Harmonic mean of precision and recall | -| Balanced accuracy | `calculateBalAcc()` | Average of sensitivity and specificity | -| AUPRC | `calculateAUPRC()` | Area under precision-recall curve | -| log2(AUPRC/prior) | `calculateLog2APOP()` | Class-imbalance adjusted AUPRC | - ## Feature importance +`extractTopFeats()` ranks features by absolute coefficient (for LR). Use `n_top_feats` for a fixed count or `prop_vi_top_feats` for a percentile range. + ```{r feature-importance} -# Extract top features from fitted model by number top_features <- extractTopFeats(fit, n_top_feats = 20) +head(top_features) +``` -# Or by proportion of variable importance -top_features <- extractTopFeats(fit, prop_vi_top_feats = c(0, 0.2)) +## Visualization -# Columns: Variable, Importance, Sign -# Positive Sign = associated with resistance -# Negative Sign = associated with susceptibility +```{r plot-prc, fig.width = 5, fig.height = 4} +plotPRC(results$pred) ``` -## Iterative feature elimination (IFE) +```{r plot-vi, fig.width = 5, fig.height = 4} +plotTopFeatsVI(results$fit, n_top_feats = 10) +``` -IFE iteratively removes top features to identify the most predictive subset: +For a baseline comparison against random labels, fit a shuffled-label pipeline and compare: -```{r ife} -ife_results <- runIFE( - ml_tibble, - by_num = TRUE, # Remove by number of features - by_vi = FALSE, # Or remove by variable importance - percent_removal_vec = 10 * 1:9, # Remove 10%, 20%, ..., 90% - mix_vec = 0, # L2 regularization for IFE - return_feats = TRUE, - verbose = TRUE +```{r plot-baseline, eval = FALSE} +shuffled <- runMLPipeline( + ml_input_tibble = ml_tibble, + model = "LR", + split = c(1, 0), + n_fold = 2, + shuffle_labels = TRUE, + return_pred = TRUE ) -# Results include nMCC at each iteration -ife_results$ife_performance_tibble -ife_results$feats_removed - -# Remove specific features manually -ml_tibble_reduced <- removeTopFeats(ml_tibble, top_features) +plotBaselineComparison( + non_shuffled_label_results = results$performance_tibble, + shuffled_label_results = shuffled$performance_tibble +) ``` -## Visualization - -### Precision-recall curve +## Iterative feature elimination -```{r plot-prc} -test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) -plotPRC(test_data_plus_predictions) -``` -### ROC curve +`runIFE()` retrains the model after iteratively removing top-ranked features, helping identify the minimal predictive subset. It runs the pipeline once per percentile in `percent_removal_vec` -```{r plot-roc} -test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) -plotROC(test_data_plus_predictions) -``` -### Variable importance plot +```{r ife, eval = FALSE} +ife_results <- runIFE( + ml_tibble, + by_num = TRUE, + by_vi = FALSE, + percent_removal_vec = 10 * 1:9, + mix_vec = 0, + return_feats = TRUE, + verbose = FALSE +) -```{r plot-vi} -topfeat <- readr::read_tsv(results / ML_top_features / Sfl_drug_AMP_domains_binary_top_features.tsv) -plotTopFeatsVI(topfeat) +ife_results$ife_performance_tibble +ife_results$feats_removed ``` -### Baseline comparison barplot -non_shuffled_label_results Output of `runMLPipeline(shuffle_labels = FALSE)` -shuffled_label_results Output of `runMLPipeline(shuffle_labels = TRUE)` -```{r plot-baseline} -# Run with shuffled labels for baseline -baseline_results <- runMLPipeline( - ml_input_tibble = ml_tibble, - model = "LR", - split = c(0.6, 0.2), - shuffle_labels = TRUE -) +`removeTopFeats()` strips a given set of features from a matrix tibble if you want to do this manually: -# Compare performance -getBaselineComparisonBarplot( - non_shuffled_label_results = results$performance_tibble, - shuffled_label_results = baseline_results$performance_tibble -) +```{r remove-feats} +trimmed <- removeTopFeats(ml_tibble, head(top_features, 5)) +ncol(ml_tibble) - ncol(trimmed) ``` -## Fisher's exact tests (baseline comparison) +## Fisher's exact tests (non-ML baseline) -As a non-ML baseline, run Fisher's exact tests with multiple testing correction: +`runFishers()` runs a Fisher's exact test of feature presence vs. phenotype for each feature, applies Benjamini–Hochberg correction, and computes per-class frequencies. ```{r fisher} -# Complete Fisher pipeline fisher_results <- runFishers( - matrix_path = "results/matrix/Cje_drug_CIP_genes_binary_sparse.parquet", - Q = 0.05, - alternative = "two.sided", + matrix_path = matrix_path, + Q = 0.05, + alternative = "two.sided", susceptible_label = "Susceptible", - resistant_label = "Resistant" + resistant_label = "Resistant" ) -# Or step-by-step: -encoded <- encodePhenotype(ml_tibble) -fisher_res <- runFisherTests(encoded$df, encoded$target, alternative = "two.sided") -fisher_res <- applyBenjaminiHochberg(fisher_res, Q = 0.05) -fisher_res <- computeFeatureFreq(encoded$df, fisher_res, encoded$target) +head(fisher_results) ``` -### Plot Fisher's results -You can label the top N features to highlight the strongest hits (default is 5) or change the BH-adjusted p-value threshold with the alpha argument - -```{r} -plotFishers(fisher_results) -plotFishers(fisher_results, alpha = 0.01, label_top_n = 5) +```{r plot-fishers, fig.width = 5, fig.height = 4} +plotFishers(fisher_results, alpha = 0.05, label_top_n = 5) ``` -## Wrapper to run all models -Given a DuckDB file produced by `runDataProcessing()`, it: -1. generates all ML feature matrices (drug, class, year, country, MDR, LOO) -2. creates all ML directory structures -3. prepares ML input lists for every mode -4. runs logistic regression ML models (standard + stratified + MDR) -5. saves performance metrics, fitted models, predictions, and top feature rankings -``` {r} -runModelingPipeline(parquet_duckdb_path, - threads = 16, - n_fold = 5, - split = c(1, 0), - min_n = 25, - prop_vi_top_feats = c(0, 1), - pca_threshold = 0.99, - verbose = TRUE, +## Training all models with `runMLmodels` + +`runMLmodels()` trains a model on every matrix produced by `generateMLInputs()` and writes performance TSVs into `out_path/ML_performance/`, predictions into `ML_pred/`, and top features into `ML_top_features/`. Takes over an hour on Sfl with default settings. + +```{r run-all-models, eval = FALSE} +runMLmodels( + path = out_dir, + stratify_by = NULL, + LOO = FALSE, + cross_test = FALSE, + threads = max(1L, parallel::detectCores() - 1L), + split = c(1, 0), + n_fold = 5, + verbose = TRUE, + return_pred = TRUE, use_saved_split = TRUE ) ``` -Merge the performance and top features of each kind of models into a parquet that will serve as starting data for `amRshiny` package - -``` {r} -buildPerformancePq( - path = "results/ML_perf", - stratify_by = NULL, - LOO = FALSE, - MDR = FALSE, - cross_test = FALSE, - out_parquet = NULL, - compression = "zstd", - verbose = TRUE -) -buildTopFeatsPq( - path = "results/ML_top_features", - stratify_by = NULL, - LOO = FALSE, - MDR = FALSE, - cross_test = FALSE, - out_parquet = NULL, - compression = "zstd", - verbose = TRUE +## End-to-end: `runModelingPipeline` + +For the full pipeline from a DuckDB to all outputs in one call: + +```{r run-modeling-pipeline, eval = FALSE} +runModelingPipeline( + parquet_duckdb_path = fixture, + threads = max(1L, parallel::detectCores() - 1L), + n_fold = 5, + split = c(1, 0), + min_n = 25, + prop_vi_top_feats = c(0, 1), + pca_threshold = 0.99, + verbose = TRUE, + use_saved_split = TRUE ) ``` + +## Session info + +```{r session-info} +sessionInfo() +``` From 32d39956b4a86a64909dac2f299ae29949fe7a17 Mon Sep 17 00:00:00 2001 From: amcim Date: Wed, 27 May 2026 23:04:29 +0000 Subject: [PATCH 2/6] Style code (GHA) --- R/core_ml.R | 42 +++++++++++++++++++++++++--------------- R/generate_matrices_ml.R | 4 +++- R/ife_ml.R | 3 ++- R/plot_ml.R | 23 ++++++++++++++-------- R/prep_ml.R | 9 ++++++--- R/run_Fisher_tests.R | 18 +++++++++++------ vignettes/intro.Rmd | 10 +++++----- 7 files changed, 69 insertions(+), 40 deletions(-) diff --git a/R/core_ml.R b/R/core_ml.R index 37f04cd..39ffd21 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -214,7 +214,7 @@ buildLRModel <- function(multi_class = FALSE) { #' feat_b = rep(c(1L, 0L), 5) #' ) #' rec <- buildRecipe(train, use_pca = FALSE) -#' lr <- buildLRModel() +#' lr <- buildLRModel() #' buildWflow(lr, rec) #' @export buildWflow <- function(parsnip_mod, recipe) { @@ -282,9 +282,11 @@ buildTuningGrid <- function( #' @examples #' data(demo_ml_tibble) #' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) -#' wflow <- buildWflow(buildLRModel(), -#' buildRecipe(rsample::training(data_split))) -#' grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +#' wflow <- buildWflow( +#' buildLRModel(), +#' buildRecipe(rsample::training(data_split)) +#' ) +#' grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) #' set.seed(1) #' tuneGrid(wflow, data_split, grid, n_fold = 2) #' @export @@ -339,12 +341,15 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), #' @examples #' data(demo_ml_tibble) #' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) -#' wflow <- buildWflow(buildLRModel(), -#' buildRecipe(rsample::training(data_split))) +#' wflow <- buildWflow( +#' buildLRModel(), +#' buildRecipe(rsample::training(data_split)) +#' ) #' set.seed(1) #' tune_res <- tuneGrid(wflow, data_split, -#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), -#' n_fold = 2) +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2 +#' ) #' selectBestModel(tune_res, wflow, "mcc") #' @export selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { @@ -373,9 +378,10 @@ selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { #' train <- rsample::training(data_split) #' wflow <- buildWflow(buildLRModel(), buildRecipe(train)) #' set.seed(1) -#' tune_res <- tuneGrid(wflow, data_split, -#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), -#' n_fold = 2) +#' tune_res <- tuneGrid(wflow, data_split, +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2 +#' ) #' best_wflow <- selectBestModel(tune_res, wflow, "mcc") #' fitBestModel(best_wflow, train) #' @export @@ -455,8 +461,10 @@ predictML <- function(fit, test_data) { #' levels = c("Resistant", "Susceptible") #' ), #' .pred_class = factor( -#' c("Resistant", "Resistant", "Susceptible", "Resistant", -#' "Susceptible", "Resistant", "Susceptible", "Susceptible"), +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible" +#' ), #' levels = c("Resistant", "Susceptible") #' ) #' ) @@ -703,11 +711,13 @@ getConfusionMatrix <- function(test_data_plus_predictions) { #' levels = c("Resistant", "Susceptible") #' ), #' .pred_class = factor( -#' c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", -#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" +#' ), #' levels = c("Resistant", "Susceptible") #' ), -#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), #' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) #' ) #' calculateEvalMets(preds) diff --git a/R/generate_matrices_ml.R b/R/generate_matrices_ml.R index 69a6a87..978edb0 100644 --- a/R/generate_matrices_ml.R +++ b/R/generate_matrices_ml.R @@ -677,7 +677,9 @@ skipImbalancedMatrix <- function(genome_ids, arrow::write_parquet(combined, out_file) log("debug", paste0("Created LOO file: ", out_file)) out_file - }) |> purrr::compact() |> unlist() + }) |> + purrr::compact() |> + unlist() }) |> unlist() created <- c(created, new_files) diff --git a/R/ife_ml.R b/R/ife_ml.R index cd12268..7358640 100644 --- a/R/ife_ml.R +++ b/R/ife_ml.R @@ -74,7 +74,8 @@ removeTopFeats <- function(ml_input_tibble, top_feat_tibble) { #' data(demo_ml_tibble) #' set.seed(1) #' runIFE( -#' demo_ml_tibble, by_num = TRUE, by_vi = FALSE, +#' demo_ml_tibble, +#' by_num = TRUE, by_vi = FALSE, #' percent_removal_vec = c(25, 50), mix_vec = 0, #' return_feats = TRUE, verbose = FALSE #' ) diff --git a/R/plot_ml.R b/R/plot_ml.R index 84aea3d..c99886c 100644 --- a/R/plot_ml.R +++ b/R/plot_ml.R @@ -38,11 +38,13 @@ NULL #' levels = c("Resistant", "Susceptible") #' ), #' .pred_class = factor( -#' c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", -#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" +#' ), #' levels = c("Resistant", "Susceptible") #' ), -#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), #' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) #' ) #' plotPRC(preds) @@ -109,8 +111,10 @@ plotTopFeatsVI <- function(fit, n_top_feats = 10) { #' model = rep(c("LR", "RF"), each = 3), #' avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) #' ) -#' plotDefaultEval(default_eval, x_default_eval = "train_prop", -#' y_default_eval = "avg_f1_score") +#' plotDefaultEval(default_eval, +#' x_default_eval = "train_prop", +#' y_default_eval = "avg_f1_score" +#' ) #' @export plotDefaultEval <- function( default_eval_tibble, x_default_eval = "train_prop", @@ -241,10 +245,13 @@ plotBaselineComparison <- function( #' long <- tibble::tibble( #' genome_id = rep(paste0("g", 1:10), each = 2), #' feature_id = rep(c("gene_a", "gene_b"), 10), -#' value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, -#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), +#' value = c( +#' 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 +#' ), #' genome_drug.resistant_phenotype = rep( -#' rep(c("Resistant", "Susceptible"), each = 5), each = 2 +#' rep(c("Resistant", "Susceptible"), each = 5), +#' each = 2 #' ) #' ) #' tmp <- tempfile(fileext = ".parquet") diff --git a/R/prep_ml.R b/R/prep_ml.R index cbacc91..ef3bbff 100644 --- a/R/prep_ml.R +++ b/R/prep_ml.R @@ -79,7 +79,8 @@ NULL #' feature_id = rep(c("gene_a", "gene_b"), 6), #' value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), #' genome_drug.resistant_phenotype = rep( -#' rep(c("Resistant", "Susceptible"), each = 3), each = 2 +#' rep(c("Resistant", "Susceptible"), each = 3), +#' each = 2 #' ) #' ) #' tmp <- tempfile(fileext = ".parquet") @@ -244,8 +245,10 @@ calculateMinSamples <- function(n_fold, split, res_prop, smallest_n_obs_rs = 1) #' @examples #' ml <- tibble::tibble( #' genome_id = paste0("g", 1:4), -#' genome_drug.resistant_phenotype = c("Resistant", "Susceptible", -#' "Resistant", "Susceptible"), +#' genome_drug.resistant_phenotype = c( +#' "Resistant", "Susceptible", +#' "Resistant", "Susceptible" +#' ), #' feat_a = c(1L, 0L, 1L, 0L) #' ) #' .getTargetVarName(ml) diff --git a/R/run_Fisher_tests.R b/R/run_Fisher_tests.R index abaf42c..28b6ea0 100644 --- a/R/run_Fisher_tests.R +++ b/R/run_Fisher_tests.R @@ -27,8 +27,10 @@ NULL #' @examples #' df <- tibble::tibble( #' genome_id = paste0("g", 1:6), -#' genome_drug.resistant_phenotype = c("Resistant", "Susceptible", "Resistant", -#' "Susceptible", "Resistant", "Susceptible"), +#' genome_drug.resistant_phenotype = c( +#' "Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible" +#' ), #' gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), #' gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) #' ) @@ -136,7 +138,8 @@ applyBenjaminiHochberg <- function(df_fisher, Q = 0.05) { #' ) #' encoded <- encodePhenotype(df) #' fisher <- applyBenjaminiHochberg( -#' runFisherTests(encoded$df, encoded$target), Q = 0.05 +#' runFisherTests(encoded$df, encoded$target), +#' Q = 0.05 #' ) #' computeFeatureFreq(encoded$df, fisher, encoded$target) #' @export @@ -176,10 +179,13 @@ computeFeatureFreq <- function(df, df_fisher, target) { #' long <- tibble::tibble( #' genome_id = rep(paste0("g", 1:10), each = 2), #' feature_id = rep(c("gene_a", "gene_b"), 10), -#' value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, -#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), +#' value = c( +#' 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 +#' ), #' genome_drug.resistant_phenotype = rep( -#' rep(c("Resistant", "Susceptible"), each = 5), each = 2 +#' rep(c("Resistant", "Susceptible"), each = 5), +#' each = 2 #' ) #' ) #' tmp <- tempfile(fileext = ".parquet") diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 89f3b18..be27d7a 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -80,7 +80,7 @@ matrix_path <- file.path( out_dir, "matrix", "Sfl_drug_AMP_genes_binary_sparse.parquet" ) -ml_tibble <- loadMLInputTibble(matrix_path) +ml_tibble <- loadMLInputTibble(matrix_path) n_features <- getNumFeat(ml_tibble) target_var <- .getTargetVarName(ml_tibble) @@ -148,13 +148,13 @@ The builders below are what `runMLPipeline()` chains internally. Call them direc ```{r split-data} data_split <- splitMLInputTibble(ml_tibble, split = c(0.6, 0.2), seed = 123) train_data <- rsample::training(data_split) -test_data <- rsample::testing(data_split) +test_data <- rsample::testing(data_split) ``` ```{r build-model} recipe <- buildRecipe(train_data, use_pca = FALSE) lr_mod <- buildLRModel(multi_class = FALSE) -wflow <- buildWflow(lr_mod, recipe) +wflow <- buildWflow(lr_mod, recipe) grid <- buildTuningGrid( model = "LR", @@ -164,9 +164,9 @@ grid <- buildTuningGrid( ``` ```{r tune-fit} -tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) +tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) best_wflow <- selectBestModel(tune_res, wflow, select_best_metric = "mcc") -fit <- fitBestModel(best_wflow, train_data) +fit <- fitBestModel(best_wflow, train_data) preds <- predictML(fit, test_data) ``` From 07d59b1bcc6d13974e38f01cf827b62b577da687 Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Wed, 27 May 2026 17:37:47 -0600 Subject: [PATCH 3/6] adding BiocStyle to DESCRIPTION --- DESCRIPTION | 20 +++++++++++++------- NAMESPACE | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5802afc..7956cdb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,10 +21,10 @@ Description: Comprehensive machine learning (ML) pipeline for predicting antimic License: BSD_3_clause + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 Depends: R (>= 4.5.0) Suggests: + BiocStyle, ComplexHeatmap, knitr, rmarkdown, @@ -34,17 +34,17 @@ VignetteBuilder: knitr Config/testthat/edition: 3 Imports: arrow, + BiocParallel, DBI, dplyr, duckdb, - future, - future.apply, ggplot2, ggrepel, glmnet, glue, jsonlite, hardhat, + methods, parsnip, purrr, readr, @@ -57,14 +57,20 @@ Imports: tidyr, tune, vip, + withr, workflows, workflowsets, yardstick biocViews: - ML, - AMR, - MicrobialGenomics, - Pathogen, + Software, + Classification, + Regression, + StatisticalMethod, + FeatureExtraction, + MultipleComparison, + FunctionalGenomics, + Genetics, Visualization URL: https://github.com/JRaviLab/amRml BugReports: https://github.com/JRaviLab/amRml/issues +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 1c074a6..0b9d60e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -91,6 +91,7 @@ importFrom(graphics,barplot) importFrom(hardhat,tune) importFrom(jsonlite,fromJSON) importFrom(jsonlite,write_json) +importFrom(methods,is) importFrom(parsnip,augment) importFrom(parsnip,boost_tree) importFrom(parsnip,extract_fit_engine) From d3c06ed29e988314b6786c25cc6bb325c7e55dd2 Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Wed, 27 May 2026 17:50:11 -0600 Subject: [PATCH 4/6] fixing bug involving small fixture rng --- tests/testthat/test-pipeline.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-pipeline.R b/tests/testthat/test-pipeline.R index 6bb0043..8333942 100644 --- a/tests/testthat/test-pipeline.R +++ b/tests/testthat/test-pipeline.R @@ -19,6 +19,7 @@ minimal_grid_args <- list( ) run_pipeline <- function(fx, split = c(1, 0), n_fold = 2, ...) { + set.seed(1) suppressMessages(suppressWarnings( runMLPipeline( fx, From 08ecd45eceeb0d7e9180fcf9f160ee9f8015bb2c Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Fri, 29 May 2026 12:24:59 -0600 Subject: [PATCH 5/6] putting seed in runMLPipeline --- R/core_ml.R | 48 +++++++++++++++++++------------------ R/run_ML.R | 12 ++++++---- R/run_ml_pipeline.R | 6 ++++- man/buildWflow.Rd | 2 +- man/calculateEvalMets.Rd | 8 ++++--- man/computeFeatureFreq.Rd | 3 ++- man/demo_fit.Rd | 7 +++--- man/demo_ml_tibble.Rd | 10 ++++---- man/dot-getTargetVarName.Rd | 6 +++-- man/encodePhenotype.Rd | 6 +++-- man/fitBestModel.Rd | 7 +++--- man/getConfusionMatrix.Rd | 6 +++-- man/loadMLInputTibble.Rd | 3 ++- man/plotDefaultEval.Rd | 6 +++-- man/plotFishers.Rd | 9 ++++--- man/plotPRC.Rd | 8 ++++--- man/runFishers.Rd | 9 ++++--- man/runIFE.Rd | 3 ++- man/runMDRmodels.Rd | 5 +++- man/runMLmodels.Rd | 5 +++- man/selectBestModel.Rd | 11 +++++---- man/splitMLInputTibble.Rd | 7 ++++-- man/tuneGrid.Rd | 8 ++++--- 23 files changed, 120 insertions(+), 75 deletions(-) diff --git a/R/core_ml.R b/R/core_ml.R index 39ffd21..21309f1 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -70,7 +70,10 @@ NULL #' genome is resistant), but not both. #' @param split [num] Vector of length 2 indicating the proportion of data to #' be designated as training and validation, respectively. -#' @param seed [num] For reproducible analysis +#' @param seed [num] Optional. If supplied, the split is seeded (and the +#' caller's RNG state restored afterward) for standalone reproducibility. When +#' `NULL` (the default, as used by `runMLPipeline()`), the split inherits the +#' ambient RNG stream so it can share one seed with downstream tuning and fitting. #' @return An `rsplit` object #' @examples #' ml <- tibble::tibble( @@ -81,34 +84,33 @@ NULL #' ) #' splitMLInputTibble(ml, split = c(1, 0), seed = 42) #' @export -splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) { +splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = NULL) { .checkArgTibble(ml_input_tibble, ml = TRUE) .checkArgSplit(split) - .checkArgSeed(seed) + if (!is.null(seed)) { + .checkArgSeed(seed) + withr::local_seed(seed) + } target_var <- .getTargetVarName(ml_input_tibble) # Split the data, maintaining R/S proportions. - data_split <- withr::with_seed(seed, { - if (split[2] == 0) { - # If in CV mode: - # Still retain a stratified testing holdout purely for final reporting metrics; - # CV is only performed on the training portion. - prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test - rsample::initial_split( - ml_input_tibble, - prop = prop_train_for_holdout, - strata = !!target_var - ) - } else { - rsample::initial_validation_split( - ml_input_tibble, - prop = split, - strata = !!target_var - ) - } - }) - + if (split[2] == 0) { + # CV mode: retain a stratified testing holdout purely for final reporting + # metrics; CV is only performed on the training portion. + prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test + data_split <- rsample::initial_split( + ml_input_tibble, + prop = prop_train_for_holdout, + strata = !!target_var + ) + } else { + data_split <-rsample::initial_validation_split( + ml_input_tibble, + prop = split, + strata = !!target_var + ) + } return(data_split) } diff --git a/R/run_ML.R b/R/run_ML.R index 06caff2..e85d42e 100644 --- a/R/run_ML.R +++ b/R/run_ML.R @@ -542,6 +542,7 @@ createMLinputList <- function(path, #' @param use_saved_split Logical. Whether to inherit split/seed/n_fold from ml_parameters.json. Default TRUE. #' @param shuffle_labels Logical. Randomly shuffle labels for baseline runs. Default FALSE. #' @param use_pca Logical. Use PCA on predictors. Default FALSE. +#' @param seed Integer. Seed for the parallel RNG streams (`BiocParallel`). Default 5280. #' #' @return NULL (invisible). Called for side effects (model training and result saving). #' @@ -580,7 +581,8 @@ runMDRmodels <- function(path, return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE) { + use_pca = FALSE, + seed = 5280) { files <- createMLinputList(path, stratify_by = NULL, LOO = FALSE, @@ -593,7 +595,7 @@ runMDRmodels <- function(path, return(invisible(NULL)) } - param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = 42) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = seed) if (isTRUE(verbose)) { message("runMDRmodels(): enabling SnowParam with workers = ", threads) @@ -724,6 +726,7 @@ runMDRmodels <- function(path, #' @param use_saved_split Logical. Whether to inherit split/seed/n_fold from ml_parameters.json. Default TRUE. #' @param shuffle_labels Logical. Randomly shuffle labels for baseline runs. Default FALSE. #' @param use_pca Logical. Use PCA on predictors. Default FALSE. +#' @param seed Integer. Seed for the parallel RNG streams (`BiocParallel`). Default 5280. #' #' @return NULL (invisible). Called for side effects (model training and result saving). #' @@ -843,7 +846,8 @@ runMLmodels <- function(path, return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE) { + use_pca = FALSE, + seed = 5280) { if (!is.null(stratify_by)) { if (!is.character(stratify_by) || length(stratify_by) != 1L) { stop("`stratify_by` must be NULL or a single string: 'year' or 'country'.") @@ -865,7 +869,7 @@ runMLmodels <- function(path, return(invisible(NULL)) } - param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = 42) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = seed) if (isTRUE(verbose)) { message("runMLmodels(): enabling SnowParam with workers = ", threads) diff --git a/R/run_ml_pipeline.R b/R/run_ml_pipeline.R index 797cab0..eab8dcc 100644 --- a/R/run_ml_pipeline.R +++ b/R/run_ml_pipeline.R @@ -100,7 +100,11 @@ runMLPipeline <- function( .checkArgReturnTuneRes(return_tune_res) .checkArgReturnFit(return_fit) .checkArgReturnPred(return_pred) + .checkArgSeed(seed) + # Seed once for the whole pipeline so the split, CV folds, tuning, and fit + # share one continuous RNG stream (restored to the caller's state on exit). + withr::local_seed(seed) # Set `n_fold` to `NA` if not using cross-validation. if (split[2] != 0) { @@ -223,7 +227,7 @@ runMLPipeline <- function( dplyr::select(where(~ !any(is.na(.)))) } - data_split <- splitMLInputTibble(ml_input_tibble, split = split, seed = seed) + data_split <- splitMLInputTibble(ml_input_tibble, split = split) # Now correct `data_split` if external `test_data` is provided. if (external_test_data & split[2] != 0) { diff --git a/man/buildWflow.Rd b/man/buildWflow.Rd index e3a1a6b..5791562 100644 --- a/man/buildWflow.Rd +++ b/man/buildWflow.Rd @@ -26,6 +26,6 @@ train <- tibble::tibble( feat_b = rep(c(1L, 0L), 5) ) rec <- buildRecipe(train, use_pca = FALSE) -lr <- buildLRModel() +lr <- buildLRModel() buildWflow(lr, rec) } diff --git a/man/calculateEvalMets.Rd b/man/calculateEvalMets.Rd index c4529a9..0506645 100644 --- a/man/calculateEvalMets.Rd +++ b/man/calculateEvalMets.Rd @@ -27,11 +27,13 @@ preds <- tibble::tibble( levels = c("Resistant", "Susceptible") ), .pred_class = factor( - c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", - "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), + c( + "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" + ), levels = c("Resistant", "Susceptible") ), - .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) ) calculateEvalMets(preds) diff --git a/man/computeFeatureFreq.Rd b/man/computeFeatureFreq.Rd index 3c79dd4..961a507 100644 --- a/man/computeFeatureFreq.Rd +++ b/man/computeFeatureFreq.Rd @@ -28,7 +28,8 @@ df <- tibble::tibble( ) encoded <- encodePhenotype(df) fisher <- applyBenjaminiHochberg( - runFisherTests(encoded$df, encoded$target), Q = 0.05 + runFisherTests(encoded$df, encoded$target), + Q = 0.05 ) computeFeatureFreq(encoded$df, fisher, encoded$target) } diff --git a/man/demo_fit.Rd b/man/demo_fit.Rd index b7b38db..0810296 100644 --- a/man/demo_fit.Rd +++ b/man/demo_fit.Rd @@ -3,18 +3,17 @@ \docType{data} \name{demo_fit} \alias{demo_fit} -\title{Small demo LR fit} +\title{Demo LR fit} \format{ A fitted \code{workflow} object (output of \code{\link[=fitBestModel]{fitBestModel()}}). } \source{ -Built by \code{inst/scripts/make_demo_data.R}. +\code{inst/scripts/make_demo_data.R}. } \usage{ data(demo_fit) } \description{ -A tuned logistic-regression workflow fitted on \link{demo_ml_tibble}, for use -in package examples that require a pre-fitted model. +A tuned logistic-regression workflow fitted on \link{demo_ml_tibble}. } \keyword{datasets} diff --git a/man/demo_ml_tibble.Rd b/man/demo_ml_tibble.Rd index 4a02e3e..021834d 100644 --- a/man/demo_ml_tibble.Rd +++ b/man/demo_ml_tibble.Rd @@ -3,20 +3,20 @@ \docType{data} \name{demo_ml_tibble} \alias{demo_ml_tibble} -\title{Small demo ML input tibble} +\title{Demo ML input tibble} \format{ A tibble with 60 rows and 82 columns: \code{genome_id}, \code{genome_drug.resistant_phenotype}, and 80 binary feature columns. } \source{ -Built by \code{inst/scripts/make_demo_data.R}. +\code{inst/scripts/make_demo_data.R}. } \usage{ data(demo_ml_tibble) } \description{ -A subset of the AMP-genes-binary matrix generated from the bundled -\code{Sfl_parquet.duckdb} fixture, trimmed to 60 rows (30 Resistant, 30 -Susceptible) and ~80 feature columns for use in package examples. +Stratified subset (30 Resistant + 30 Susceptible) of the AMP-genes-binary +matrix from the bundled \code{Sfl_parquet.duckdb}, restricted to 80 feature +columns. } \keyword{datasets} diff --git a/man/dot-getTargetVarName.Rd b/man/dot-getTargetVarName.Rd index 641ca8d..f9bd512 100644 --- a/man/dot-getTargetVarName.Rd +++ b/man/dot-getTargetVarName.Rd @@ -26,8 +26,10 @@ either \code{genome_drug.resistant_phenotype} or \code{resistant_classes} \examples{ ml <- tibble::tibble( genome_id = paste0("g", 1:4), - genome_drug.resistant_phenotype = c("Resistant", "Susceptible", - "Resistant", "Susceptible"), + genome_drug.resistant_phenotype = c( + "Resistant", "Susceptible", + "Resistant", "Susceptible" + ), feat_a = c(1L, 0L, 1L, 0L) ) .getTargetVarName(ml) diff --git a/man/encodePhenotype.Rd b/man/encodePhenotype.Rd index 14993c2..37dc92b 100644 --- a/man/encodePhenotype.Rd +++ b/man/encodePhenotype.Rd @@ -30,8 +30,10 @@ encodePhenotype \examples{ df <- tibble::tibble( genome_id = paste0("g", 1:6), - genome_drug.resistant_phenotype = c("Resistant", "Susceptible", "Resistant", - "Susceptible", "Resistant", "Susceptible"), + genome_drug.resistant_phenotype = c( + "Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible" + ), gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) ) diff --git a/man/fitBestModel.Rd b/man/fitBestModel.Rd index cdc7c58..70a95f1 100644 --- a/man/fitBestModel.Rd +++ b/man/fitBestModel.Rd @@ -25,9 +25,10 @@ data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) train <- rsample::training(data_split) wflow <- buildWflow(buildLRModel(), buildRecipe(train)) set.seed(1) -tune_res <- tuneGrid(wflow, data_split, - buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), - n_fold = 2) +tune_res <- tuneGrid(wflow, data_split, + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2 +) best_wflow <- selectBestModel(tune_res, wflow, "mcc") fitBestModel(best_wflow, train) } diff --git a/man/getConfusionMatrix.Rd b/man/getConfusionMatrix.Rd index 1047fba..45af5de 100644 --- a/man/getConfusionMatrix.Rd +++ b/man/getConfusionMatrix.Rd @@ -25,8 +25,10 @@ preds <- tibble::tibble( levels = c("Resistant", "Susceptible") ), .pred_class = factor( - c("Resistant", "Resistant", "Susceptible", "Resistant", - "Susceptible", "Resistant", "Susceptible", "Susceptible"), + c( + "Resistant", "Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible", "Susceptible" + ), levels = c("Resistant", "Susceptible") ) ) diff --git a/man/loadMLInputTibble.Rd b/man/loadMLInputTibble.Rd index 8e3310b..bfcfcf4 100644 --- a/man/loadMLInputTibble.Rd +++ b/man/loadMLInputTibble.Rd @@ -24,7 +24,8 @@ long <- tibble::tibble( feature_id = rep(c("gene_a", "gene_b"), 6), value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), genome_drug.resistant_phenotype = rep( - rep(c("Resistant", "Susceptible"), each = 3), each = 2 + rep(c("Resistant", "Susceptible"), each = 3), + each = 2 ) ) tmp <- tempfile(fileext = ".parquet") diff --git a/man/plotDefaultEval.Rd b/man/plotDefaultEval.Rd index 9eb41ad..ca2dcfe 100644 --- a/man/plotDefaultEval.Rd +++ b/man/plotDefaultEval.Rd @@ -41,6 +41,8 @@ default_eval <- tibble::tibble( model = rep(c("LR", "RF"), each = 3), avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) ) -plotDefaultEval(default_eval, x_default_eval = "train_prop", - y_default_eval = "avg_f1_score") +plotDefaultEval(default_eval, + x_default_eval = "train_prop", + y_default_eval = "avg_f1_score" +) } diff --git a/man/plotFishers.Rd b/man/plotFishers.Rd index 53a326f..fa4729e 100644 --- a/man/plotFishers.Rd +++ b/man/plotFishers.Rd @@ -38,10 +38,13 @@ Labels are applied only to the top-ranked features to preserve clarity. long <- tibble::tibble( genome_id = rep(paste0("g", 1:10), each = 2), feature_id = rep(c("gene_a", "gene_b"), 10), - value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, - 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), + value = c( + 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 + ), genome_drug.resistant_phenotype = rep( - rep(c("Resistant", "Susceptible"), each = 5), each = 2 + rep(c("Resistant", "Susceptible"), each = 5), + each = 2 ) ) tmp <- tempfile(fileext = ".parquet") diff --git a/man/plotPRC.Rd b/man/plotPRC.Rd index 19fe869..3bad222 100644 --- a/man/plotPRC.Rd +++ b/man/plotPRC.Rd @@ -25,11 +25,13 @@ preds <- tibble::tibble( levels = c("Resistant", "Susceptible") ), .pred_class = factor( - c("Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", - "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant"), + c( + "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" + ), levels = c("Resistant", "Susceptible") ), - .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) ) plotPRC(preds) diff --git a/man/runFishers.Rd b/man/runFishers.Rd index d90593c..a39b095 100644 --- a/man/runFishers.Rd +++ b/man/runFishers.Rd @@ -33,10 +33,13 @@ runFishers long <- tibble::tibble( genome_id = rep(paste0("g", 1:10), each = 2), feature_id = rep(c("gene_a", "gene_b"), 10), - value = c(1, 0, 1, 0, 1, 1, 1, 1, 0, 1, - 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), + value = c( + 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 + ), genome_drug.resistant_phenotype = rep( - rep(c("Resistant", "Susceptible"), each = 5), each = 2 + rep(c("Resistant", "Susceptible"), each = 5), + each = 2 ) ) tmp <- tempfile(fileext = ".parquet") diff --git a/man/runIFE.Rd b/man/runIFE.Rd index b9d2ce7..2e39c9e 100644 --- a/man/runIFE.Rd +++ b/man/runIFE.Rd @@ -55,7 +55,8 @@ returns nMCC at each iteration. data(demo_ml_tibble) set.seed(1) runIFE( - demo_ml_tibble, by_num = TRUE, by_vi = FALSE, + demo_ml_tibble, + by_num = TRUE, by_vi = FALSE, percent_removal_vec = c(25, 50), mix_vec = 0, return_feats = TRUE, verbose = FALSE ) diff --git a/man/runMDRmodels.Rd b/man/runMDRmodels.Rd index 74be1e3..73df41d 100644 --- a/man/runMDRmodels.Rd +++ b/man/runMDRmodels.Rd @@ -17,7 +17,8 @@ runMDRmodels( return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE + use_pca = FALSE, + seed = 5280 ) } \arguments{ @@ -46,6 +47,8 @@ runMDRmodels( \item{shuffle_labels}{Logical. Randomly shuffle labels for baseline runs. Default FALSE.} \item{use_pca}{Logical. Use PCA on predictors. Default FALSE.} + +\item{seed}{Integer. Seed for the parallel RNG streams (\code{BiocParallel}). Default 5280.} } \value{ NULL (invisible). Called for side effects (model training and result saving). diff --git a/man/runMLmodels.Rd b/man/runMLmodels.Rd index 501e8d8..2020be2 100644 --- a/man/runMLmodels.Rd +++ b/man/runMLmodels.Rd @@ -20,7 +20,8 @@ runMLmodels( return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE + use_pca = FALSE, + seed = 5280 ) } \arguments{ @@ -55,6 +56,8 @@ runMLmodels( \item{shuffle_labels}{Logical. Randomly shuffle labels for baseline runs. Default FALSE.} \item{use_pca}{Logical. Use PCA on predictors. Default FALSE.} + +\item{seed}{Integer. Seed for the parallel RNG streams (\code{BiocParallel}). Default 5280.} } \value{ NULL (invisible). Called for side effects (model training and result saving). diff --git a/man/selectBestModel.Rd b/man/selectBestModel.Rd index c1f6f2d..e507795 100644 --- a/man/selectBestModel.Rd +++ b/man/selectBestModel.Rd @@ -23,11 +23,14 @@ Selects best model by F1 score, AUPRC, MCC, or balanced accuracy. \examples{ data(demo_ml_tibble) data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) -wflow <- buildWflow(buildLRModel(), - buildRecipe(rsample::training(data_split))) +wflow <- buildWflow( + buildLRModel(), + buildRecipe(rsample::training(data_split)) +) set.seed(1) tune_res <- tuneGrid(wflow, data_split, - buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), - n_fold = 2) + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2 +) selectBestModel(tune_res, wflow, "mcc") } diff --git a/man/splitMLInputTibble.Rd b/man/splitMLInputTibble.Rd index bfa3254..12b324b 100644 --- a/man/splitMLInputTibble.Rd +++ b/man/splitMLInputTibble.Rd @@ -4,7 +4,7 @@ \alias{splitMLInputTibble} \title{splitMLInputTibble()} \usage{ -splitMLInputTibble(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) +splitMLInputTibble(ml_input_tibble, split = c(0.6, 0.2), seed = NULL) } \arguments{ \item{ml_input_tibble}{An ML-ready tibble generated by \code{loadMLInputTibble()}. @@ -17,7 +17,10 @@ genome is resistant), but not both.} \item{split}{\link[pillar:num]{num} Vector of length 2 indicating the proportion of data to be designated as training and validation, respectively.} -\item{seed}{\link[pillar:num]{num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} Optional. If supplied, the split is seeded (and the +caller's RNG state restored afterward) for standalone reproducibility. When +\code{NULL} (the default, as used by \code{runMLPipeline()}), the split inherits the +ambient RNG stream so it can share one seed with downstream tuning and fitting.} } \value{ An \code{rsplit} object diff --git a/man/tuneGrid.Rd b/man/tuneGrid.Rd index 90fc4e9..9d985a5 100644 --- a/man/tuneGrid.Rd +++ b/man/tuneGrid.Rd @@ -27,9 +27,11 @@ specified. \examples{ data(demo_ml_tibble) data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) -wflow <- buildWflow(buildLRModel(), - buildRecipe(rsample::training(data_split))) -grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +wflow <- buildWflow( + buildLRModel(), + buildRecipe(rsample::training(data_split)) +) +grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) set.seed(1) tuneGrid(wflow, data_split, grid, n_fold = 2) } From f7726c730f092150fbd8e0f0138437d0d9bc4778 Mon Sep 17 00:00:00 2001 From: amcim Date: Fri, 29 May 2026 18:26:36 +0000 Subject: [PATCH 6/6] Style code (GHA) --- R/core_ml.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/core_ml.R b/R/core_ml.R index 21309f1..d805e9c 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -105,7 +105,7 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = NULL strata = !!target_var ) } else { - data_split <-rsample::initial_validation_split( + data_split <- rsample::initial_validation_split( ml_input_tibble, prop = split, strata = !!target_var