diff --git a/DESCRIPTION b/DESCRIPTION index 5802afc..7956cdb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,10 +21,10 @@ Description: Comprehensive machine learning (ML) pipeline for predicting antimic License: BSD_3_clause + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 Depends: R (>= 4.5.0) Suggests: + BiocStyle, ComplexHeatmap, knitr, rmarkdown, @@ -34,17 +34,17 @@ VignetteBuilder: knitr Config/testthat/edition: 3 Imports: arrow, + BiocParallel, DBI, dplyr, duckdb, - future, - future.apply, ggplot2, ggrepel, glmnet, glue, jsonlite, hardhat, + methods, parsnip, purrr, readr, @@ -57,14 +57,20 @@ Imports: tidyr, tune, vip, + withr, workflows, workflowsets, yardstick biocViews: - ML, - AMR, - MicrobialGenomics, - Pathogen, + Software, + Classification, + Regression, + StatisticalMethod, + FeatureExtraction, + MultipleComparison, + FunctionalGenomics, + Genetics, Visualization URL: https://github.com/JRaviLab/amRml BugReports: https://github.com/JRaviLab/amRml/issues +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 1c074a6..0b9d60e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -91,6 +91,7 @@ importFrom(graphics,barplot) importFrom(hardhat,tune) importFrom(jsonlite,fromJSON) importFrom(jsonlite,write_json) +importFrom(methods,is) importFrom(parsnip,augment) importFrom(parsnip,boost_tree) importFrom(parsnip,extract_fit_engine) diff --git a/R/arg_check_ml.R b/R/arg_check_ml.R index 41f01c8..f6acf09 100644 --- a/R/arg_check_ml.R +++ b/R/arg_check_ml.R @@ -1,5 +1,6 @@ # This script contains functions to check the arguments of ML functions. +#' @importFrom methods is #' @importFrom tibble is_tibble NULL @@ -254,7 +255,7 @@ NULL #' `buildLRModel()` (random forest and boosted tree support planned) #' .checkArgParsnipMod <- function(parsnip_mod) { - if (class(parsnip_mod)[2] != "model_spec") { + if (!is(parsnip_mod, "model_spec")) { stop("A `parsnip` model was expected but not received.") } } @@ -418,7 +419,7 @@ NULL #' @param tune_res Results of grid tuning, such as the output of `tuneGrid()` #' .checkArgTuneRes <- function(tune_res) { - if (class(tune_res)[1] != "tune_results") { + if (!is(tune_res, "tune_results")) { stop("The `tune_res` argument can only take `tune_results` objects.") } } diff --git a/R/core_ml.R b/R/core_ml.R index 5846b40..d805e9c 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -12,6 +12,7 @@ #' @importFrom dplyr select #' @importFrom dplyr slice #' @importFrom hardhat tune +#' @importFrom methods is #' @importFrom parsnip augment #' @importFrom parsnip boost_tree #' @importFrom parsnip extract_fit_engine @@ -69,23 +70,34 @@ NULL #' genome is resistant), but not both. #' @param split [num] Vector of length 2 indicating the proportion of data to #' be designated as training and validation, respectively. -#' @param seed [num] For reproducible analysis +#' @param seed [num] Optional. If supplied, the split is seeded (and the +#' caller's RNG state restored afterward) for standalone reproducibility. When +#' `NULL` (the default, as used by `runMLPipeline()`), the split inherits the +#' ambient RNG stream so it can share one seed with downstream tuning and fitting. #' @return An `rsplit` object +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:20), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 10), +#' feat_a = rep(c(0L, 1L), 10), +#' feat_b = rep(c(1L, 0L), 10) +#' ) +#' splitMLInputTibble(ml, split = c(1, 0), seed = 42) #' @export -splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) { +splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = NULL) { .checkArgTibble(ml_input_tibble, ml = TRUE) .checkArgSplit(split) - .checkArgSeed(seed) - - set.seed(seed) + if (!is.null(seed)) { + .checkArgSeed(seed) + withr::local_seed(seed) + } target_var <- .getTargetVarName(ml_input_tibble) # Split the data, maintaining R/S proportions. if (split[2] == 0) { - # If in CV mode: - # Still retain a stratified testing holdout purely for final reporting metrics; - # CV is only performed on the training portion. + # CV mode: retain a stratified testing holdout purely for final reporting + # metrics; CV is only performed on the training portion. prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test data_split <- rsample::initial_split( ml_input_tibble, @@ -99,7 +111,6 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280 strata = !!target_var ) } - return(data_split) } @@ -114,6 +125,14 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280 #' @param pca_threshold [num] The proportion of total variance for which the #' principle components account #' @return A `recipe` object +#' @examples +#' train <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' feat_a = rep(c(0L, 1L), 5), +#' feat_b = rep(c(1L, 0L), 5) +#' ) +#' buildRecipe(train, use_pca = FALSE) #' @export buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { .checkArgTibble(train_data, ml = TRUE) @@ -157,6 +176,9 @@ buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { #' @param multi_class [bool] Whether to construct a model for multi-class #' classification #' @return A `parsnip` `logistic_reg` object +#' @examples +#' buildLRModel() +#' buildLRModel(multi_class = TRUE) #' @export buildLRModel <- function(multi_class = FALSE) { .checkArgMultiClass(multi_class) @@ -186,6 +208,16 @@ buildLRModel <- function(multi_class = FALSE) { #' `buildLRModel()` (random forest and boosted tree support planned) #' @param recipe A recipe, such as the output of `buildRecipe()` #' @return A `workflow` object +#' @examples +#' train <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' feat_a = rep(c(0L, 1L), 5), +#' feat_b = rep(c(1L, 0L), 5) +#' ) +#' rec <- buildRecipe(train, use_pca = FALSE) +#' lr <- buildLRModel() +#' buildWflow(lr, rec) #' @export buildWflow <- function(parsnip_mod, recipe) { .checkArgParsnipMod(parsnip_mod) @@ -210,6 +242,12 @@ buildWflow <- function(parsnip_mod, recipe) { #' regression. 0 corresponds to L2 regularization; 1 corresponds to L1; #' intermediate values (0, 1) correspond to elastic net. #' @return A logistic regression tuning grid as a tibble +#' @examples +#' buildTuningGrid( +#' model = "LR", +#' penalty_vec = 10^c(-3, -1), +#' mix_vec = c(0, 0.5, 1) +#' ) #' @export buildTuningGrid <- function( model = "LR", @@ -243,6 +281,16 @@ buildTuningGrid <- function( #' `buildTuningGrid()` #' @param n_fold [num] Number of folds of cross-validation #' @return Results of grid tuning +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' wflow <- buildWflow( +#' buildLRModel(), +#' buildRecipe(rsample::training(data_split)) +#' ) +#' grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +#' set.seed(1) +#' tuneGrid(wflow, data_split, grid, n_fold = 2) #' @export tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), n_fold = 5) { @@ -250,20 +298,18 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), .checkArgWflow(wflow) .checkArgDataSplit(data_split) - split_class <- class(data_split)[1] - # Always do CV on the training portion of the split train_df <- rsample::training(data_split) target_var <- .getTargetVarName(train_df) - if (identical(split_class, "initial_split")) { + if (is(data_split, "initial_split")) { # CV on training portion; final eval will use the held-out test set resamples <- rsample::vfold_cv(train_df, v = n_fold, strata = !!target_var) - } else if (identical(split_class, "initial_validation_split")) { + } else if (is(data_split, "initial_validation_split")) { # Use the validation partition from the original three-way split. resamples <- rsample::validation_set(data_split) } else { - stop("Unsupported rsample split object: ", split_class) + stop("Unsupported rsample split object: ", class(data_split)[1]) } tune_res <- tune::tune_grid( @@ -294,6 +340,19 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), #' @param select_best_metric [chr] Metric to select best model: "f_meas", #' "pr_auc", "mcc", or "bal_accuracy" #' @return Best model workflow +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' wflow <- buildWflow( +#' buildLRModel(), +#' buildRecipe(rsample::training(data_split)) +#' ) +#' set.seed(1) +#' tune_res <- tuneGrid(wflow, data_split, +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2 +#' ) +#' selectBestModel(tune_res, wflow, "mcc") #' @export selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { .checkArgTuneRes(tune_res) @@ -315,6 +374,18 @@ selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { #' training. This can be the output of #' `rsample::training(splitMLInputTibble(ml_input_tibble))`. #' @return Best model fit +#' @examples +#' data(demo_ml_tibble) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' train <- rsample::training(data_split) +#' wflow <- buildWflow(buildLRModel(), buildRecipe(train)) +#' set.seed(1) +#' tune_res <- tuneGrid(wflow, data_split, +#' buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), +#' n_fold = 2 +#' ) +#' best_wflow <- selectBestModel(tune_res, wflow, "mcc") +#' fitBestModel(best_wflow, train) #' @export fitBestModel <- function(final_mod, train_data) { .checkArgWflow(final_mod) @@ -361,6 +432,11 @@ fitBestModel <- function(final_mod, train_data) { #' `rsample::testing(splitMLInputTibble(ml_input_tibble))`. #' @return Test data (tibble) with an added column for predicted phenotype #' labels +#' @examples +#' data(demo_ml_tibble) +#' data(demo_fit) +#' data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +#' predictML(demo_fit, rsample::testing(data_split)) #' @export predictML <- function(fit, test_data) { .checkArgWflow(fit) @@ -379,6 +455,22 @@ predictML <- function(fit, test_data) { #' @param test_data_plus_predictions Test data (tibble) with an added column for #' predicted phenotype labels, such as the output of `predictML()` #' @return Confusion matrix of class `conf_mat` +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:8), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 4), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible" +#' ), +#' levels = c("Resistant", "Susceptible") +#' ) +#' ) +#' getConfusionMatrix(preds) #' @export getConfusionMatrix <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -613,6 +705,24 @@ getConfusionMatrix <- function(test_data_plus_predictions) { #' #' @inheritParams getConfusionMatrix #' @return F1 score, AUPRC, balanced accuracy, nMCC, and log2(AUPRC/prior) +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" +#' ), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +#' ) +#' calculateEvalMets(preds) #' @export calculateEvalMets <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -644,6 +754,9 @@ calculateEvalMets <- function(test_data_plus_predictions) { #' @return A tibble with a column for top features (`Variable`), a column for #' `Importance`, and a column for `Sign` (or, for multi-class, a tibble with #' per-class columns of importance scores for each `Variable`) +#' @examples +#' data(demo_fit) +#' extractTopFeats(demo_fit, n_top_feats = 10) #' @export extractTopFeats <- function( fit, prop_vi_top_feats = c(0, 1), @@ -691,7 +804,7 @@ extractTopFeats <- function( # Take a different approach if using multi-class (the previous code would give # a less meaningful result). - if (class(fit$fit$actions$model$spec)[1] == "multinom_reg") { + if (is(fit$fit$actions$model$spec, "multinom_reg")) { warning(paste( "Extracting top features from a multi-class model.", "The `prop_vi_top_feats` and `n_top_feats` arguments do not apply." diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..0f06f74 --- /dev/null +++ b/R/data.R @@ -0,0 +1,18 @@ +#' Demo ML input tibble +#' +#' Stratified subset (30 Resistant + 30 Susceptible) of the AMP-genes-binary +#' matrix from the bundled `Sfl_parquet.duckdb`, restricted to 80 feature +#' columns. +#' +#' @format A tibble with 60 rows and 82 columns: `genome_id`, +#' `genome_drug.resistant_phenotype`, and 80 binary feature columns. +#' @source `inst/scripts/make_demo_data.R`. +"demo_ml_tibble" + +#' Demo LR fit +#' +#' A tuned logistic-regression workflow fitted on [demo_ml_tibble]. +#' +#' @format A fitted `workflow` object (output of [fitBestModel()]). +#' @source `inst/scripts/make_demo_data.R`. +"demo_fit" diff --git a/R/generate_matrices_ml.R b/R/generate_matrices_ml.R index 67ba14c..978edb0 100644 --- a/R/generate_matrices_ml.R +++ b/R/generate_matrices_ml.R @@ -654,10 +654,10 @@ skipImbalancedMatrix <- function(genome_ids, grouped <- split(group_df, group_df$drug_or_class) - purrr::walk(names(grouped), function(drug_class) { + new_files <- purrr::map(names(grouped), function(drug_class) { group <- grouped[[drug_class]] - purrr::walk(unique(group$stratification), function(leave_one_out) { + purrr::map(unique(group$stratification), function(leave_one_out) { subset <- dplyr::filter(group, stratification != leave_one_out) if (nrow(subset) == 0) { return(NULL) @@ -675,11 +675,14 @@ skipImbalancedMatrix <- function(genome_ids, ) )) arrow::write_parquet(combined, out_file) - created <<- c(created, out_file) - log("debug", paste0("Created LOO file: ", out_file)) - }) - }) + out_file + }) |> + purrr::compact() |> + unlist() + }) |> unlist() + + created <- c(created, new_files) } log("info", "All LOO matrices generated and saved.") diff --git a/R/ife_ml.R b/R/ife_ml.R index 3d72a6d..7358640 100644 --- a/R/ife_ml.R +++ b/R/ife_ml.R @@ -17,6 +17,16 @@ NULL #' `runMLPipeline()`, containing all features to be removed from the #' `ml_input_tibble` #' @return An amR pangenome with top features removed +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' feat_b = c(0L, 1L, 0L, 1L, 0L, 1L), +#' feat_c = c(1L, 1L, 0L, 0L, 1L, 0L) +#' ) +#' top <- tibble::tibble(Variable = c("feat_a", "feat_c")) +#' removeTopFeats(ml, top) #' @export removeTopFeats <- function(ml_input_tibble, top_feat_tibble) { .checkArgTibble(ml_input_tibble) @@ -60,6 +70,15 @@ removeTopFeats <- function(ml_input_tibble, top_feat_tibble) { #' @param verbose [bool] The function will stay quiet if set to `FALSE`. #' @return A tibble with IFE performance (note: this will be returned within a #' list along with top features removed per iteration if `return_feats = TRUE`.) +#' @examples +#' data(demo_ml_tibble) +#' set.seed(1) +#' runIFE( +#' demo_ml_tibble, +#' by_num = TRUE, by_vi = FALSE, +#' percent_removal_vec = c(25, 50), mix_vec = 0, +#' return_feats = TRUE, verbose = FALSE +#' ) #' @export runIFE <- function( ml_input_tibble, by_num = TRUE, by_vi = FALSE, diff --git a/R/plot_ml.R b/R/plot_ml.R index 90121f3..c99886c 100644 --- a/R/plot_ml.R +++ b/R/plot_ml.R @@ -30,6 +30,24 @@ NULL #' @param test_data_plus_predictions Test data (tibble) with an added column for #' predicted phenotype labels, such as the output of `predict()`. #' @return A precision-recall curve as a `ggplot2` object +#' @examples +#' preds <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = factor( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_class = factor( +#' c( +#' "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", +#' "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" +#' ), +#' levels = c("Resistant", "Susceptible") +#' ), +#' .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), +#' .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +#' ) +#' plotPRC(preds) #' @export plotPRC <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) @@ -54,6 +72,9 @@ plotPRC <- function(test_data_plus_predictions) { #' @param fit Best model fit, such as the output of `fitBestModel()` #' @param n_top_feats [num] Number of top features to plot #' @return Variable importance plot (a `ggplot2` object) +#' @examples +#' data(demo_fit) +#' plotTopFeatsVI(demo_fit, n_top_feats = 10) #' @export plotTopFeatsVI <- function(fit, n_top_feats = 10) { .checkArgWflow(fit) @@ -83,6 +104,17 @@ plotTopFeatsVI <- function(fit, n_top_feats = 10) { #' @param ylab [chr] Label for y axis #' @return A `ggplot2` scatterplot (performance metric or runtime vs. #' `train_prop` or `n_fold`), colored by model +#' @examples +#' default_eval <- tibble::tibble( +#' train_prop = c(0.5, 0.6, 0.7, 0.5, 0.6, 0.7), +#' n_fold = rep(5, 6), +#' model = rep(c("LR", "RF"), each = 3), +#' avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) +#' ) +#' plotDefaultEval(default_eval, +#' x_default_eval = "train_prop", +#' y_default_eval = "avg_f1_score" +#' ) #' @export plotDefaultEval <- function( default_eval_tibble, x_default_eval = "train_prop", @@ -138,6 +170,16 @@ plotDefaultEval <- function( #' @param shuffled_label_results Output of `runMLPipeline()` #' (`shuffle_labels = TRUE`) #' @return A bar plot with balanced accuracy comparisons per antibiotic +#' @examples +#' non_shuffled <- tibble::tibble( +#' antibiotic = c("AMP", "CIP", "CRO"), +#' bal_acc = c(0.88, 0.81, 0.92) +#' ) +#' shuffled <- tibble::tibble( +#' antibiotic = c("AMP", "CIP", "CRO"), +#' bal_acc = c(0.52, 0.49, 0.55) +#' ) +#' plotBaselineComparison(non_shuffled, shuffled) #' @export plotBaselineComparison <- function( non_shuffled_label_results, @@ -200,10 +242,22 @@ plotBaselineComparison <- function( #' Labels are applied only to the top-ranked features to preserve clarity. #' #' @examples -#' \dontrun{ -#' plotFishers(fisher_results) -#' plotFishers(fisher_results, label_top_n = 0) -#' } +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:10), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 10), +#' value = c( +#' 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 +#' ), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' fisher_results <- runFishers(tmp, Q = 0.05) +#' plotFishers(fisher_results, alpha = 0.05, label_top_n = 2) #' #' @import ggplot2 #' @import dplyr diff --git a/R/prep_ml.R b/R/prep_ml.R index 4a5954e..ef3bbff 100644 --- a/R/prep_ml.R +++ b/R/prep_ml.R @@ -73,6 +73,19 @@ NULL #' upstream processing. #' @returns A tibble (one row per genome, one column per feature), plus #' "Resistant" or "Susceptible" AMR phenotype labels +#' @examples +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:6), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 6), +#' value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 3), +#' each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' loadMLInputTibble(tmp) #' @export loadMLInputTibble <- function(parquet_path) { .checkArgPath(parquet_path) @@ -135,6 +148,14 @@ loadMLInputTibble <- function(parquet_path) { #' (multi-class classification for determining the drug classes to which each #' genome is resistant), but not both. #' @return Number of features +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' feat_b = c(0L, 1L, 0L, 1L, 0L, 1L) +#' ) +#' getNumFeat(ml) #' @export getNumFeat <- function(ml_input_tibble) { .checkArgTibble(ml_input_tibble, ml = TRUE) @@ -168,19 +189,27 @@ getNumFeat <- function(ml_input_tibble) { #' genome is resistant), but not both. #' @param seed [num] For reproducible analysis #' @return Pangenome with randomly shuffled AMR phenotype labels +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), +#' feat_a = c(1L, 0L, 1L, 0L, 1L, 0L) +#' ) +#' shuffleLabels(ml, seed = 42) #' @export shuffleLabels <- function(ml_input_tibble, seed = 5280) { .checkArgTibble(ml_input_tibble, ml = TRUE) .checkArgSeed(seed) - set.seed(seed) target_var <- .getTargetVarName(ml_input_tibble) log <- .ml_logger("debug") log("debug", "Shuffling phenotype labels (for baseline models).") - ml_input_tibble |> - dplyr::mutate(!!target_var := sample(!!target_var)) + withr::with_seed(seed, { + ml_input_tibble |> + dplyr::mutate(!!target_var := sample(!!target_var)) + }) } #' calculateMinSamples() @@ -190,6 +219,9 @@ shuffleLabels <- function(ml_input_tibble, seed = 5280) { #' @param smallest_n_obs_rs [numeric] Minimum number of observations of the #' rarer class required per fold/partition. Default is 1. #' @return Minimum total observations required, adjusted by smallest_n_obs_rs +#' @examples +#' calculateMinSamples(n_fold = 5, split = c(1, 0), res_prop = 0.3) +#' calculateMinSamples(n_fold = 5, split = c(0.6, 0.2), res_prop = 0.1) #' @export calculateMinSamples <- function(n_fold, split, res_prop, smallest_n_obs_rs = 1) { base <- .calculateMinSamples(n_fold, split, res_prop) @@ -210,6 +242,16 @@ calculateMinSamples <- function(n_fold, split, res_prop, smallest_n_obs_rs = 1) #' @return The name of the target variable to be used for machine learning: #' either `rlang::sym("genome_drug.resistant_phenotype")` or #' `rlang::sym("resistant_classes")` +#' @examples +#' ml <- tibble::tibble( +#' genome_id = paste0("g", 1:4), +#' genome_drug.resistant_phenotype = c( +#' "Resistant", "Susceptible", +#' "Resistant", "Susceptible" +#' ), +#' feat_a = c(1L, 0L, 1L, 0L) +#' ) +#' .getTargetVarName(ml) #' @export .getTargetVarName <- function(ml_input_tibble) { .checkArgTibble(ml_input_tibble, ml = TRUE) diff --git a/R/run_Fisher_tests.R b/R/run_Fisher_tests.R index e0cf7f5..28b6ea0 100644 --- a/R/run_Fisher_tests.R +++ b/R/run_Fisher_tests.R @@ -24,6 +24,18 @@ NULL #' @return A list with: #' - `df`: the input data with AMR phenotype now encoded as integer #' - `target`: a binarized vector of the encoded AMR phenotypes +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:6), +#' genome_drug.resistant_phenotype = c( +#' "Resistant", "Susceptible", "Resistant", +#' "Susceptible", "Resistant", "Susceptible" +#' ), +#' gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), +#' gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) +#' ) +#' encoded <- encodePhenotype(df) +#' encoded$target #' @export encodePhenotype <- function(df, susceptible_label = "Susceptible", resistant_label = "Resistant") { stopifnot("genome_drug.resistant_phenotype" %in% colnames(df)) # Throw a fit if it's the wrong matrix @@ -49,6 +61,15 @@ encodePhenotype <- function(df, susceptible_label = "Susceptible", resistant_lab #' @param alternative Type of Fisher test: "two.sided", "greater", or "less" #' #' @return A tibble with columns: gene, p_value +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), +#' gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +#' ) +#' encoded <- encodePhenotype(df) +#' runFisherTests(encoded$df, encoded$target) #' @export runFisherTests <- function(df, target, alternative = "two.sided") { feature_cols <- df |> @@ -77,6 +98,12 @@ runFisherTests <- function(df, target, alternative = "two.sided") { #' @param Q The FDR threshold (default = 0.05) #' #' @return Returns Fisher results with added cols for adj_p_value, sig_after_bh, Q +#' @examples +#' fisher_results <- tibble::tibble( +#' gene = paste0("gene_", 1:5), +#' p_value = c(0.001, 0.01, 0.04, 0.2, 0.5) +#' ) +#' applyBenjaminiHochberg(fisher_results, Q = 0.05) #' @export applyBenjaminiHochberg <- function(df_fisher, Q = 0.05) { stopifnot("p_value" %in% colnames(df_fisher)) @@ -102,6 +129,19 @@ applyBenjaminiHochberg <- function(df_fisher, Q = 0.05) { #' @param target The binarized AMR phenotype "target" output from before #' #' @return The BH Fisher data table with added feature frequency columns +#' @examples +#' df <- tibble::tibble( +#' genome_id = paste0("g", 1:10), +#' genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), +#' gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), +#' gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +#' ) +#' encoded <- encodePhenotype(df) +#' fisher <- applyBenjaminiHochberg( +#' runFisherTests(encoded$df, encoded$target), +#' Q = 0.05 +#' ) +#' computeFeatureFreq(encoded$df, fisher, encoded$target) #' @export computeFeatureFreq <- function(df, df_fisher, target) { df_features <- df |> @@ -135,6 +175,22 @@ computeFeatureFreq <- function(df, df_fisher, target) { #' @param resistant_label "Resistant" phenotype label in the data #' #' @return The Fisher test results, BH correction, and feature frequencies +#' @examples +#' long <- tibble::tibble( +#' genome_id = rep(paste0("g", 1:10), each = 2), +#' feature_id = rep(c("gene_a", "gene_b"), 10), +#' value = c( +#' 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, +#' 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 +#' ), +#' genome_drug.resistant_phenotype = rep( +#' rep(c("Resistant", "Susceptible"), each = 5), +#' each = 2 +#' ) +#' ) +#' tmp <- tempfile(fileext = ".parquet") +#' arrow::write_parquet(long, tmp) +#' runFishers(tmp, Q = 0.05) #' @export runFishers <- function( matrix_path, diff --git a/R/run_ML.R b/R/run_ML.R index 2ed07e7..e85d42e 100644 --- a/R/run_ML.R +++ b/R/run_ML.R @@ -1,7 +1,8 @@ -#' @keywords internal #' Pulls the ML parameters .json and reads what model split parameters are to be #' used. These defaults can be overridden if you so choose, but consider regenerating #' the ML matrices with these new split/CV values instead. +#' +#' @keywords internal #' @noRd .resolveSplitParams <- function(parquet_path, defaults = list( @@ -49,19 +50,11 @@ #' } #' #' @examples -#' \dontrun{ -#' # Basic directory structure -#' paths <- createMLResultDir("/path/to/results") -#' -#' # LOO analysis stratified by year -#' paths_loo <- createMLResultDir("/path/to/results", -#' stratify_by = "year", -#' LOO = TRUE -#' ) -#' -#' # MDR analysis -#' paths_mdr <- createMLResultDir("/path/to/results", MDR = TRUE) -#' } +#' out_dir <- file.path(tempdir(), "amRml_createdir_example") +#' dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +#' createMLResultDir(out_dir) +#' createMLResultDir(out_dir, stratify_by = "year", LOO = TRUE) +#' createMLResultDir(out_dir, MDR = TRUE) #' #' @export createMLResultDir <- function(path, @@ -533,7 +526,7 @@ createMLinputList <- function(path, #' Run MDR (multi-drug resistance) machine learning models #' #' Executes machine learning pipeline for MDR analysis using logistic regression -#' with parallel processing via the future backend. Trains models on all MDR +#' with parallel processing via BiocParallel. Trains models on all MDR #' parquet files and saves results to designated output directories. #' #' @param path Character scalar. Base directory containing MDR matrix files. @@ -549,6 +542,7 @@ createMLinputList <- function(path, #' @param use_saved_split Logical. Whether to inherit split/seed/n_fold from ml_parameters.json. Default TRUE. #' @param shuffle_labels Logical. Randomly shuffle labels for baseline runs. Default FALSE. #' @param use_pca Logical. Use PCA on predictors. Default FALSE. +#' @param seed Integer. Seed for the parallel RNG streams (`BiocParallel`). Default 5280. #' #' @return NULL (invisible). Called for side effects (model training and result saving). #' @@ -587,7 +581,8 @@ runMDRmodels <- function(path, return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE) { + use_pca = FALSE, + seed = 5280) { files <- createMLinputList(path, stratify_by = NULL, LOO = FALSE, @@ -600,21 +595,19 @@ runMDRmodels <- function(path, return(invisible(NULL)) } - old_plan <- future::plan() - on.exit(future::plan(old_plan), add = TRUE) - future::plan(future::multisession, workers = threads) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = seed) if (isTRUE(verbose)) { - nw <- tryCatch(future::nbrOfWorkers(), error = function(e) NA_integer_) - message("runMDRmodels(): enabling multisession with workers = ", nw) + message("runMDRmodels(): enabling SnowParam with workers = ", threads) } # Auto tags for shuffled and PCA shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" - results_list <- future.apply::future_lapply( + results_list <- BiocParallel::bplapply( seq_len(nrow(files)), + BPPARAM = param, FUN = function(i) { ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] @@ -697,8 +690,7 @@ runMDRmodels <- function(path, } NULL - }, - future.seed = TRUE + } ) if (verbose) { @@ -734,6 +726,7 @@ runMDRmodels <- function(path, #' @param use_saved_split Logical. Whether to inherit split/seed/n_fold from ml_parameters.json. Default TRUE. #' @param shuffle_labels Logical. Randomly shuffle labels for baseline runs. Default FALSE. #' @param use_pca Logical. Use PCA on predictors. Default FALSE. +#' @param seed Integer. Seed for the parallel RNG streams (`BiocParallel`). Default 5280. #' #' @return NULL (invisible). Called for side effects (model training and result saving). #' @@ -790,8 +783,7 @@ runMDRmodels <- function(path, #' @note #' This function requires the following packages: #' \itemize{ -#' \item \pkg{future} - for parallel processing backend -#' \item \pkg{future.apply} - for parallel lapply +#' \item \pkg{BiocParallel} - for parallel processing backend #' \item \pkg{readr} - for reading/writing TSV files #' \item \pkg{dplyr}, \pkg{purrr}, \pkg{stringr}, \pkg{tibble} - for data manipulation #' } @@ -854,7 +846,8 @@ runMLmodels <- function(path, return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE) { + use_pca = FALSE, + seed = 5280) { if (!is.null(stratify_by)) { if (!is.character(stratify_by) || length(stratify_by) != 1L) { stop("`stratify_by` must be NULL or a single string: 'year' or 'country'.") @@ -876,13 +869,10 @@ runMLmodels <- function(path, return(invisible(NULL)) } - old_plan <- future::plan() - on.exit(future::plan(old_plan), add = TRUE) - future::plan(future::multisession, workers = threads) + param <- BiocParallel::SnowParam(workers = threads, type = "SOCK", RNGseed = seed) if (isTRUE(verbose)) { - nw <- tryCatch(future::nbrOfWorkers(), error = function(e) NA_integer_) - message("runMLmodels(): enabling multisession with workers = ", nw) + message("runMLmodels(): enabling SnowParam with workers = ", threads) } # LOO / cross-test configuration prefix @@ -909,8 +899,9 @@ runMLmodels <- function(path, shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" - results_list <- future.apply::future_lapply( + results_list <- BiocParallel::bplapply( seq_len(nrow(files)), + BPPARAM = param, FUN = function(i) { ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] @@ -1004,8 +995,7 @@ runMLmodels <- function(path, } NULL - }, - future.seed = TRUE + } ) if (verbose) { @@ -1040,6 +1030,16 @@ runMLmodels <- function(path, #' #' @return Invisibly returns the output directory used for ML results. #' +#' @examples +#' \dontrun{ +#' runModelingPipeline( +#' parquet_duckdb_path = "path/to/Bug_parquet.duckdb", +#' threads = 16, n_fold = 5, split = c(1, 0), min_n = 25, +#' prop_vi_top_feats = c(0, 1), pca_threshold = 0.99, +#' verbose = TRUE, use_saved_split = TRUE +#' ) +#' } +#' #' @export runModelingPipeline <- function(parquet_duckdb_path, threads = 16, diff --git a/R/run_ml_pipeline.R b/R/run_ml_pipeline.R index ad9f691..eab8dcc 100644 --- a/R/run_ml_pipeline.R +++ b/R/run_ml_pipeline.R @@ -74,6 +74,15 @@ NULL #' `top_feat_tibble`. Tuning results, the fit object, and model predictions may #' also be returned if `return_tune_res`, `return_fit`, and/or `return_pred`, #' respectively, are set to `TRUE`. +#' @examples +#' data(demo_ml_tibble) +#' set.seed(1) +#' runMLPipeline( +#' ml_input_tibble = demo_ml_tibble, model = "LR", +#' split = c(1, 0), n_fold = 2, +#' penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1), +#' n_top_feats = 10, verbose = FALSE +#' ) #' @export runMLPipeline <- function( ml_input_tibble, model = "LR", split = c(0.6, 0.2), @@ -91,7 +100,11 @@ runMLPipeline <- function( .checkArgReturnTuneRes(return_tune_res) .checkArgReturnFit(return_fit) .checkArgReturnPred(return_pred) + .checkArgSeed(seed) + # Seed once for the whole pipeline so the split, CV folds, tuning, and fit + # share one continuous RNG stream (restored to the caller's state on exit). + withr::local_seed(seed) # Set `n_fold` to `NA` if not using cross-validation. if (split[2] != 0) { @@ -214,7 +227,7 @@ runMLPipeline <- function( dplyr::select(where(~ !any(is.na(.)))) } - data_split <- splitMLInputTibble(ml_input_tibble, split = split, seed = seed) + data_split <- splitMLInputTibble(ml_input_tibble, split = split) # Now correct `data_split` if external `test_data` is provided. if (external_test_data & split[2] != 0) { diff --git a/data/demo_fit.rda b/data/demo_fit.rda new file mode 100644 index 0000000..b3fa4ea Binary files /dev/null and b/data/demo_fit.rda differ diff --git a/data/demo_ml_tibble.rda b/data/demo_ml_tibble.rda new file mode 100644 index 0000000..a09f699 Binary files /dev/null and b/data/demo_ml_tibble.rda differ diff --git a/inst/scripts/make_demo_data.R b/inst/scripts/make_demo_data.R new file mode 100644 index 0000000..22e9ea1 --- /dev/null +++ b/inst/scripts/make_demo_data.R @@ -0,0 +1,54 @@ +## Build the data() fixtures used by amRml example blocks: +## demo_ml_tibble : 60-row, 80-feature ml_input_tibble (Sfl, AMP, genes, binary) +## demo_fit : a tuned LR workflow fitted on demo_ml_tibble +## +## Run from the package root: +## Rscript inst/scripts/make_demo_data.R + +suppressPackageStartupMessages({ + library(amRml) + library(dplyr) + library(rsample) +}) + +set.seed(42) +#Loading actual sfl data +fixture <- system.file("extdata", "Sfl_parquet.duckdb", package = "amRml") +out_dir <- file.path(tempdir(), "amRml_demo_build") +if (dir.exists(out_dir)) unlink(out_dir, recursive = TRUE) +dir.create(out_dir, recursive = TRUE) + +#Generating the inputs that will be used as dataset to load for tests +generateMLInputs( + parquet_duckdb_path = fixture, + out_path = out_dir, + n_fold = 5, + split = c(1, 0), + min_n = 25, + verbosity = "minimal" +) + +target <- "genome_drug.resistant_phenotype" +full <- loadMLInputTibble(file.path( + out_dir, "matrix", "Sfl_drug_AMP_genes_binary_sparse.parquet" +)) + +#Subsetting to save on size and to run tests quickly +demo_ml_tibble <- full |> + dplyr::group_by(.data[[target]]) |> + dplyr::slice_sample(n = 30) |> + dplyr::ungroup() |> + dplyr::select(dplyr::all_of(c( + "genome_id", target, + head(setdiff(colnames(full), c("genome_id", target)), 80) + ))) + +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +train_data <- rsample::training(data_split) + +wflow <- buildWflow(buildLRModel(), buildRecipe(train_data, use_pca = FALSE)) +grid <- buildTuningGrid("LR", penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1)) +tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) +demo_fit <- fitBestModel(selectBestModel(tune_res, wflow, "mcc"), train_data) + +usethis::use_data(demo_ml_tibble, demo_fit, overwrite = TRUE, compress = "xz") diff --git a/man/amRml-package.Rd b/man/amRml-package.Rd index bdd7a8b..50cd609 100644 --- a/man/amRml-package.Rd +++ b/man/amRml-package.Rd @@ -19,6 +19,11 @@ Useful links: \author{ \strong{Maintainer}: Janani Ravi \email{janani.ravi@cuanschutz.edu} (\href{https://orcid.org/0000-0001-7443-925X}{ORCID}) +Authors: +\itemize{ + \item Janani Ravi \email{janani.ravi@cuanschutz.edu} (\href{https://orcid.org/0000-0001-7443-925X}{ORCID}) +} + Other contributors: \itemize{ \item Ethan Wolfe \email{ethan.wolfe@cuanschutz.edu} [contributor] diff --git a/man/applyBenjaminiHochberg.Rd b/man/applyBenjaminiHochberg.Rd index 75867ee..40e5897 100644 --- a/man/applyBenjaminiHochberg.Rd +++ b/man/applyBenjaminiHochberg.Rd @@ -17,3 +17,10 @@ Returns Fisher results with added cols for adj_p_value, sig_after_bh, Q \description{ applyBenjaminiHochberg } +\examples{ +fisher_results <- tibble::tibble( + gene = paste0("gene_", 1:5), + p_value = c(0.001, 0.01, 0.04, 0.2, 0.5) +) +applyBenjaminiHochberg(fisher_results, Q = 0.05) +} diff --git a/man/buildLRModel.Rd b/man/buildLRModel.Rd index 3ed514e..344617e 100644 --- a/man/buildLRModel.Rd +++ b/man/buildLRModel.Rd @@ -7,7 +7,7 @@ buildLRModel(multi_class = FALSE) } \arguments{ -\item{multi_class}{\link[arrow:data-type]{arrow::bool} Whether to construct a model for multi-class +\item{multi_class}{\link[arrow:bool]{bool} Whether to construct a model for multi-class classification} } \value{ @@ -16,3 +16,7 @@ A \code{parsnip} \code{logistic_reg} object \description{ Builds a logistic regression model. } +\examples{ +buildLRModel() +buildLRModel(multi_class = TRUE) +} diff --git a/man/buildRecipe.Rd b/man/buildRecipe.Rd index 5c2d790..704dd1e 100644 --- a/man/buildRecipe.Rd +++ b/man/buildRecipe.Rd @@ -11,9 +11,9 @@ buildRecipe(train_data, use_pca = FALSE, pca_threshold = 0.95) training. This can be the output of \code{rsample::training(splitMLInputTibble(ml_input_tibble))}.} -\item{use_pca}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to use PCA instead of all features.} +\item{use_pca}{\link[arrow:bool]{bool} Set to \code{TRUE} to use PCA instead of all features.} -\item{pca_threshold}{\link[pillar:num]{pillar::num} The proportion of total variance for which the +\item{pca_threshold}{\link[pillar:num]{num} The proportion of total variance for which the principle components account} } \value{ @@ -22,3 +22,12 @@ A \code{recipe} object \description{ Specify predictors, outcome, and metadata by building a recipe. } +\examples{ +train <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + feat_a = rep(c(0L, 1L), 5), + feat_b = rep(c(1L, 0L), 5) +) +buildRecipe(train, use_pca = FALSE) +} diff --git a/man/buildTuningGrid.Rd b/man/buildTuningGrid.Rd index 498f310..1841efe 100644 --- a/man/buildTuningGrid.Rd +++ b/man/buildTuningGrid.Rd @@ -11,13 +11,13 @@ buildTuningGrid( ) } \arguments{ -\item{model}{\link[rlang:vector-construction]{rlang::chr} Currently, logistic regression ("LR") is supported.} +\item{model}{\link[rlang:chr]{chr} Currently, logistic regression ("LR") is supported.} -\item{penalty_vec}{\link[pillar:num]{pillar::num} A vector containing \code{penalty} (regularization +\item{penalty_vec}{\link[pillar:num]{num} A vector containing \code{penalty} (regularization strength) values to try (for logistic regression). Recommended range: 10^-4 to 10^4.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing \code{mixture} values to try for logistic +\item{mix_vec}{\link[pillar:num]{num} A vector containing \code{mixture} values to try for logistic regression. 0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values (0, 1) correspond to elastic net.} } @@ -27,3 +27,10 @@ A logistic regression tuning grid as a tibble \description{ Builds a tuning grid according to the input model. } +\examples{ +buildTuningGrid( + model = "LR", + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1) +) +} diff --git a/man/buildWflow.Rd b/man/buildWflow.Rd index 1bf7932..5791562 100644 --- a/man/buildWflow.Rd +++ b/man/buildWflow.Rd @@ -18,3 +18,14 @@ A \code{workflow} object \description{ Builds a \code{tidymodels} workflow based on an input model and recipe. } +\examples{ +train <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + feat_a = rep(c(0L, 1L), 5), + feat_b = rep(c(1L, 0L), 5) +) +rec <- buildRecipe(train, use_pca = FALSE) +lr <- buildLRModel() +buildWflow(lr, rec) +} diff --git a/man/calculateEvalMets.Rd b/man/calculateEvalMets.Rd index a06b7e7..0506645 100644 --- a/man/calculateEvalMets.Rd +++ b/man/calculateEvalMets.Rd @@ -19,3 +19,22 @@ accuracy, normalized (to a 0 to 1 scale instead of -1 to 1) Matthews correlation coefficient (nMCC), and log2(AUPRC/prior) based on the AMR phenotype predictions by an ML model compared against the actual values. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 5), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c( + "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" + ), + levels = c("Resistant", "Susceptible") + ), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +) +calculateEvalMets(preds) +} diff --git a/man/calculateMinSamples.Rd b/man/calculateMinSamples.Rd index 66a18e1..98347cd 100644 --- a/man/calculateMinSamples.Rd +++ b/man/calculateMinSamples.Rd @@ -25,3 +25,7 @@ Returns the minimum number of total observations needed (one bug-drug combo) to have at least one observation of the rarer class in each CV fold or, in validation mode, across train/val/test. } +\examples{ +calculateMinSamples(n_fold = 5, split = c(1, 0), res_prop = 0.3) +calculateMinSamples(n_fold = 5, split = c(0.6, 0.2), res_prop = 0.1) +} diff --git a/man/computeFeatureFreq.Rd b/man/computeFeatureFreq.Rd index a0b33f1..961a507 100644 --- a/man/computeFeatureFreq.Rd +++ b/man/computeFeatureFreq.Rd @@ -19,3 +19,17 @@ The BH Fisher data table with added feature frequency columns \description{ computeFeatureFreq } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), + gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +) +encoded <- encodePhenotype(df) +fisher <- applyBenjaminiHochberg( + runFisherTests(encoded$df, encoded$target), + Q = 0.05 +) +computeFeatureFreq(encoded$df, fisher, encoded$target) +} diff --git a/man/createMLResultDir.Rd b/man/createMLResultDir.Rd index 2e522b2..ec032f0 100644 --- a/man/createMLResultDir.Rd +++ b/man/createMLResultDir.Rd @@ -42,17 +42,10 @@ Creates a structured directory hierarchy for storing machine learning results including matrices, performance metrics, feature importance, models, and predictions. } \examples{ -\dontrun{ -# Basic directory structure -paths <- createMLResultDir("/path/to/results") - -# LOO analysis stratified by year -paths_loo <- createMLResultDir("/path/to/results", - stratify_by = "year", - LOO = TRUE) - -# MDR analysis -paths_mdr <- createMLResultDir("/path/to/results", MDR = TRUE) -} +out_dir <- file.path(tempdir(), "amRml_createdir_example") +dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +createMLResultDir(out_dir) +createMLResultDir(out_dir, stratify_by = "year", LOO = TRUE) +createMLResultDir(out_dir, MDR = TRUE) } diff --git a/man/createMLinputList.Rd b/man/createMLinputList.Rd index 2104415..7e15eed 100644 --- a/man/createMLinputList.Rd +++ b/man/createMLinputList.Rd @@ -49,8 +49,9 @@ inputs <- createMLinputList("/path/to/results") # Cross-test with year stratification inputs_ct <- createMLinputList("/path/to/results", - stratify_by = "year", - cross_test = TRUE) + stratify_by = "year", + cross_test = TRUE +) # MDR analysis inputs_mdr <- createMLinputList("/path/to/results", MDR = TRUE) diff --git a/man/demo_fit.Rd b/man/demo_fit.Rd new file mode 100644 index 0000000..0810296 --- /dev/null +++ b/man/demo_fit.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{demo_fit} +\alias{demo_fit} +\title{Demo LR fit} +\format{ +A fitted \code{workflow} object (output of \code{\link[=fitBestModel]{fitBestModel()}}). +} +\source{ +\code{inst/scripts/make_demo_data.R}. +} +\usage{ +data(demo_fit) +} +\description{ +A tuned logistic-regression workflow fitted on \link{demo_ml_tibble}. +} +\keyword{datasets} diff --git a/man/demo_ml_tibble.Rd b/man/demo_ml_tibble.Rd new file mode 100644 index 0000000..021834d --- /dev/null +++ b/man/demo_ml_tibble.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{demo_ml_tibble} +\alias{demo_ml_tibble} +\title{Demo ML input tibble} +\format{ +A tibble with 60 rows and 82 columns: \code{genome_id}, +\code{genome_drug.resistant_phenotype}, and 80 binary feature columns. +} +\source{ +\code{inst/scripts/make_demo_data.R}. +} +\usage{ +data(demo_ml_tibble) +} +\description{ +Stratified subset (30 Resistant + 30 Susceptible) of the AMP-genes-binary +matrix from the bundled \code{Sfl_parquet.duckdb}, restricted to 80 feature +columns. +} +\keyword{datasets} diff --git a/man/dot-getTargetVarName.Rd b/man/dot-getTargetVarName.Rd index cdbf92d..f9bd512 100644 --- a/man/dot-getTargetVarName.Rd +++ b/man/dot-getTargetVarName.Rd @@ -23,3 +23,14 @@ either \code{rlang::sym("genome_drug.resistant_phenotype")} or Returns the name of the target variable to be used for machine learning: either \code{genome_drug.resistant_phenotype} or \code{resistant_classes} } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:4), + genome_drug.resistant_phenotype = c( + "Resistant", "Susceptible", + "Resistant", "Susceptible" + ), + feat_a = c(1L, 0L, 1L, 0L) +) +.getTargetVarName(ml) +} diff --git a/man/encodePhenotype.Rd b/man/encodePhenotype.Rd index 7067e7d..37dc92b 100644 --- a/man/encodePhenotype.Rd +++ b/man/encodePhenotype.Rd @@ -27,3 +27,16 @@ A list with: \description{ encodePhenotype } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = c( + "Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible" + ), + gene_a = c(1L, 0L, 1L, 0L, 1L, 0L), + gene_b = c(0L, 1L, 1L, 0L, 0L, 1L) +) +encoded <- encodePhenotype(df) +encoded$target +} diff --git a/man/extractTopFeats.Rd b/man/extractTopFeats.Rd index c6ab154..b43cf0c 100644 --- a/man/extractTopFeats.Rd +++ b/man/extractTopFeats.Rd @@ -9,14 +9,14 @@ extractTopFeats(fit, prop_vi_top_feats = c(0, 1), n_top_feats = NA) \arguments{ \item{fit}{Best model fit, such as the output of \code{fitBestModel()}} -\item{prop_vi_top_feats}{\link[pillar:num]{pillar::num} A vector of length 2 with elements together +\item{prop_vi_top_feats}{\link[pillar:num]{num} A vector of length 2 with elements together indicating the proportion of total variable importance the top features should comprise. To get the features that contribute to the top 10 to 20\% of total variable importance, for example, set \code{prop_vi_top_feats = c(0.1, 0.2)}. Returns all features by default.} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to extract} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to extract} } \value{ A tibble with a column for top features (\code{Variable}), a column for @@ -27,3 +27,7 @@ per-class columns of importance scores for each \code{Variable}) Returns the top features that an ML model found to be important for predicting the AMR phenotype of a given pathogen. } +\examples{ +data(demo_fit) +extractTopFeats(demo_fit, n_top_feats = 10) +} diff --git a/man/fitBestModel.Rd b/man/fitBestModel.Rd index 2724cb1..70a95f1 100644 --- a/man/fitBestModel.Rd +++ b/man/fitBestModel.Rd @@ -19,3 +19,16 @@ Best model fit \description{ Fits the best model (output of \code{selectBestModel()}). } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +train <- rsample::training(data_split) +wflow <- buildWflow(buildLRModel(), buildRecipe(train)) +set.seed(1) +tune_res <- tuneGrid(wflow, data_split, + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2 +) +best_wflow <- selectBestModel(tune_res, wflow, "mcc") +fitBestModel(best_wflow, train) +} diff --git a/man/getConfusionMatrix.Rd b/man/getConfusionMatrix.Rd index a47b012..45af5de 100644 --- a/man/getConfusionMatrix.Rd +++ b/man/getConfusionMatrix.Rd @@ -17,3 +17,20 @@ Confusion matrix of class \code{conf_mat} Returns the confusion matrix based on the AMR phenotype predictions by an ML model compared against the actual values. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:8), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 4), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c( + "Resistant", "Resistant", "Susceptible", "Resistant", + "Susceptible", "Resistant", "Susceptible", "Susceptible" + ), + levels = c("Resistant", "Susceptible") + ) +) +getConfusionMatrix(preds) +} diff --git a/man/getNumFeat.Rd b/man/getNumFeat.Rd index d616bb2..75f2565 100644 --- a/man/getNumFeat.Rd +++ b/man/getNumFeat.Rd @@ -20,3 +20,12 @@ Number of features \description{ Returns the number of features in an amR pangenome. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), + feat_b = c(0L, 1L, 0L, 1L, 0L, 1L) +) +getNumFeat(ml) +} diff --git a/man/loadMLInputTibble.Rd b/man/loadMLInputTibble.Rd index 0cdb61b..bfcfcf4 100644 --- a/man/loadMLInputTibble.Rd +++ b/man/loadMLInputTibble.Rd @@ -7,7 +7,7 @@ loadMLInputTibble(parquet_path) } \arguments{ -\item{parquet_path}{\link[rlang:vector-construction]{rlang::chr} Path to a long-format sparse Parquet file from +\item{parquet_path}{\link[rlang:chr]{chr} Path to a long-format sparse Parquet file from upstream processing.} } \value{ @@ -18,3 +18,17 @@ A tibble (one row per genome, one column per feature), plus Reads in a Parquet file as a tibble and converts it from long to wide format to prepare it for ML. } +\examples{ +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:6), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 6), + value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 3), + each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +loadMLInputTibble(tmp) +} diff --git a/man/plotBaselineComparison.Rd b/man/plotBaselineComparison.Rd index 2b6b94d..b212ab2 100644 --- a/man/plotBaselineComparison.Rd +++ b/man/plotBaselineComparison.Rd @@ -20,3 +20,14 @@ A bar plot with balanced accuracy comparisons per antibiotic Generates a bar plot that compares model performance with and without randomly shuffled AMR phenotype labels. } +\examples{ +non_shuffled <- tibble::tibble( + antibiotic = c("AMP", "CIP", "CRO"), + bal_acc = c(0.88, 0.81, 0.92) +) +shuffled <- tibble::tibble( + antibiotic = c("AMP", "CIP", "CRO"), + bal_acc = c(0.52, 0.49, 0.55) +) +plotBaselineComparison(non_shuffled, shuffled) +} diff --git a/man/plotDefaultEval.Rd b/man/plotDefaultEval.Rd index 25eeaf9..ca2dcfe 100644 --- a/man/plotDefaultEval.Rd +++ b/man/plotDefaultEval.Rd @@ -15,16 +15,16 @@ plotDefaultEval( \arguments{ \item{default_eval_tibble}{Output of \code{findOptimalMLDefaults()}} -\item{x_default_eval}{\link[rlang:vector-construction]{rlang::chr} x value of default evaluation plot: "train_prop" +\item{x_default_eval}{\link[rlang:chr]{chr} x value of default evaluation plot: "train_prop" or "n_fold"} -\item{y_default_eval}{\link[rlang:vector-construction]{rlang::chr} y value of default evaluation plot. It can be +\item{y_default_eval}{\link[rlang:chr]{chr} y value of default evaluation plot. It can be "avg_runtime_sec" or one of the following performance metrics: "avg_f1_score", "avg_log2_apop", "avg_bal_acc", or "avg_nmcc"} -\item{xlab}{\link[rlang:vector-construction]{rlang::chr} Label for x axis} +\item{xlab}{\link[rlang:chr]{chr} Label for x axis} -\item{ylab}{\link[rlang:vector-construction]{rlang::chr} Label for y axis} +\item{ylab}{\link[rlang:chr]{chr} Label for y axis} } \value{ A \code{ggplot2} scatterplot (performance metric or runtime vs. @@ -34,3 +34,15 @@ A \code{ggplot2} scatterplot (performance metric or runtime vs. Plots performance metric or runtime vs. training data proportion or number of cross-validation folds, colored by model. } +\examples{ +default_eval <- tibble::tibble( + train_prop = c(0.5, 0.6, 0.7, 0.5, 0.6, 0.7), + n_fold = rep(5, 6), + model = rep(c("LR", "RF"), each = 3), + avg_f1_score = c(0.72, 0.78, 0.83, 0.70, 0.75, 0.80) +) +plotDefaultEval(default_eval, + x_default_eval = "train_prop", + y_default_eval = "avg_f1_score" +) +} diff --git a/man/plotFishers.Rd b/man/plotFishers.Rd index ba515df..fa4729e 100644 --- a/man/plotFishers.Rd +++ b/man/plotFishers.Rd @@ -35,9 +35,21 @@ Color explicitly encodes whether a feature passes the BH threshold. Labels are applied only to the top-ranked features to preserve clarity. } \examples{ -\dontrun{ -plotFishers(fisher_results) -plotFishers(fisher_results, label_top_n = 0) -} +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:10), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 10), + value = c( + 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 + ), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 5), + each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +fisher_results <- runFishers(tmp, Q = 0.05) +plotFishers(fisher_results, alpha = 0.05, label_top_n = 2) } diff --git a/man/plotPRC.Rd b/man/plotPRC.Rd index e1d3cc4..3bad222 100644 --- a/man/plotPRC.Rd +++ b/man/plotPRC.Rd @@ -17,3 +17,22 @@ A precision-recall curve as a \code{ggplot2} object Plots the precision-recall curve given a set of test data plus predicted AMR phenotypes. } +\examples{ +preds <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = factor( + rep(c("Resistant", "Susceptible"), each = 5), + levels = c("Resistant", "Susceptible") + ), + .pred_class = factor( + c( + "Resistant", "Resistant", "Susceptible", "Resistant", "Susceptible", + "Susceptible", "Resistant", "Susceptible", "Susceptible", "Resistant" + ), + levels = c("Resistant", "Susceptible") + ), + .pred_Resistant = c(0.9, 0.8, 0.4, 0.7, 0.3, 0.2, 0.6, 0.1, 0.2, 0.55), + .pred_Susceptible = c(0.1, 0.2, 0.6, 0.3, 0.7, 0.8, 0.4, 0.9, 0.8, 0.45) +) +plotPRC(preds) +} diff --git a/man/plotTopFeatsVI.Rd b/man/plotTopFeatsVI.Rd index e7d3008..9bd920b 100644 --- a/man/plotTopFeatsVI.Rd +++ b/man/plotTopFeatsVI.Rd @@ -9,7 +9,7 @@ plotTopFeatsVI(fit, n_top_feats = 10) \arguments{ \item{fit}{Best model fit, such as the output of \code{fitBestModel()}} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to plot} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to plot} } \value{ Variable importance plot (a \code{ggplot2} object) @@ -18,3 +18,7 @@ Variable importance plot (a \code{ggplot2} object) Generates a plot showing the top features and their variable importance scores. } +\examples{ +data(demo_fit) +plotTopFeatsVI(demo_fit, n_top_feats = 10) +} diff --git a/man/predictML.Rd b/man/predictML.Rd index 2da4b8e..53a882c 100644 --- a/man/predictML.Rd +++ b/man/predictML.Rd @@ -21,3 +21,9 @@ labels Predicts the AMR phenotype in the test data set based on the best fit ML model. } +\examples{ +data(demo_ml_tibble) +data(demo_fit) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +predictML(demo_fit, rsample::testing(data_split)) +} diff --git a/man/removeTopFeats.Rd b/man/removeTopFeats.Rd index a6fe1ea..fe4bd79 100644 --- a/man/removeTopFeats.Rd +++ b/man/removeTopFeats.Rd @@ -19,3 +19,14 @@ An amR pangenome with top features removed \description{ Removes top features from a pangenome given previous ML results. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L), + feat_b = c(0L, 1L, 0L, 1L, 0L, 1L), + feat_c = c(1L, 1L, 0L, 0L, 1L, 0L) +) +top <- tibble::tibble(Variable = c("feat_a", "feat_c")) +removeTopFeats(ml, top) +} diff --git a/man/runFisherTests.Rd b/man/runFisherTests.Rd index 96c1fa6..676b36c 100644 --- a/man/runFisherTests.Rd +++ b/man/runFisherTests.Rd @@ -19,3 +19,13 @@ A tibble with columns: gene, p_value \description{ runFisherTests } +\examples{ +df <- tibble::tibble( + genome_id = paste0("g", 1:10), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 5), + gene_a = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0), + gene_b = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 0) +) +encoded <- encodePhenotype(df) +runFisherTests(encoded$df, encoded$target) +} diff --git a/man/runFishers.Rd b/man/runFishers.Rd index f654d2d..a39b095 100644 --- a/man/runFishers.Rd +++ b/man/runFishers.Rd @@ -29,3 +29,20 @@ The Fisher test results, BH correction, and feature frequencies \description{ runFishers } +\examples{ +long <- tibble::tibble( + genome_id = rep(paste0("g", 1:10), each = 2), + feature_id = rep(c("gene_a", "gene_b"), 10), + value = c( + 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0 + ), + genome_drug.resistant_phenotype = rep( + rep(c("Resistant", "Susceptible"), each = 5), + each = 2 + ) +) +tmp <- tempfile(fileext = ".parquet") +arrow::write_parquet(long, tmp) +runFishers(tmp, Q = 0.05) +} diff --git a/man/runIFE.Rd b/man/runIFE.Rd index 312ceba..2e39c9e 100644 --- a/man/runIFE.Rd +++ b/man/runIFE.Rd @@ -19,28 +19,28 @@ runIFE( \arguments{ \item{ml_input_tibble}{An ML-ready tibble generated by \code{loadMLInputTibble()}} -\item{by_num}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} if removing top features as a percentage +\item{by_num}{\link[arrow:bool]{bool} Set to \code{TRUE} if removing top features as a percentage of the total number of features.} -\item{by_vi}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} if removing top features by their +\item{by_vi}{\link[arrow:bool]{bool} Set to \code{TRUE} if removing top features by their contribution to the total variable importance.} -\item{percent_removal_vec}{\link[pillar:num]{pillar::num} A vector of percentages of total top +\item{percent_removal_vec}{\link[pillar:num]{num} A vector of percentages of total top features removed (if \code{by_num = TRUE} and \code{by_vi = FALSE}) or percent contributions of top features removed to the total variable importance (if \code{by_vi = TRUE} and \code{by_num = FALSE}) at each iteration. The function will automatically train with all features to start, so there is no need to specify 0 in this vector.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing mixture values (0 corresponds +\item{mix_vec}{\link[pillar:num]{num} A vector containing mixture values (0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values correspond to elastic net) to try. The default corresponds to L2 for the purpose of IFE analysis.} -\item{return_feats}{\link[arrow:data-type]{arrow::bool} Whether to return top features from each iteration +\item{return_feats}{\link[arrow:bool]{bool} Whether to return top features from each iteration of IFE.} -\item{verbose}{\link[arrow:data-type]{arrow::bool} The function will stay quiet if set to \code{FALSE}.} +\item{verbose}{\link[arrow:bool]{bool} The function will stay quiet if set to \code{FALSE}.} } \value{ A tibble with IFE performance (note: this will be returned within a @@ -51,3 +51,13 @@ runIFE Removes top features identified by ML models and retrains iteratively; returns nMCC at each iteration. } +\examples{ +data(demo_ml_tibble) +set.seed(1) +runIFE( + demo_ml_tibble, + by_num = TRUE, by_vi = FALSE, + percent_removal_vec = c(25, 50), mix_vec = 0, + return_feats = TRUE, verbose = FALSE +) +} diff --git a/man/runMDRmodels.Rd b/man/runMDRmodels.Rd index 325c14a..73df41d 100644 --- a/man/runMDRmodels.Rd +++ b/man/runMDRmodels.Rd @@ -17,7 +17,8 @@ runMDRmodels( return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE + use_pca = FALSE, + seed = 5280 ) } \arguments{ @@ -46,13 +47,15 @@ runMDRmodels( \item{shuffle_labels}{Logical. Randomly shuffle labels for baseline runs. Default FALSE.} \item{use_pca}{Logical. Use PCA on predictors. Default FALSE.} + +\item{seed}{Integer. Seed for the parallel RNG streams (\code{BiocParallel}). Default 5280.} } \value{ NULL (invisible). Called for side effects (model training and result saving). } \description{ Executes machine learning pipeline for MDR analysis using logistic regression -with parallel processing via the future backend. Trains models on all MDR +with parallel processing via BiocParallel. Trains models on all MDR parquet files and saves results to designated output directories. } \examples{ @@ -62,13 +65,15 @@ runMDRmodels("/path/to/results") # Run with more threads and minimal output runMDRmodels("/path/to/results", - threads = 32, - verbose = FALSE) + threads = 32, + verbose = FALSE +) # Run without saving model fits (save disk space) runMDRmodels("/path/to/results", - threads = 16, - return_fit = FALSE) + threads = 16, + return_fit = FALSE +) } } diff --git a/man/runMLPipeline.Rd b/man/runMLPipeline.Rd index 31c542e..2b53951 100644 --- a/man/runMLPipeline.Rd +++ b/man/runMLPipeline.Rd @@ -35,10 +35,10 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{model}{\link[rlang:vector-construction]{rlang::chr} Logistic regression ("LR"), random forest ("RF"), or +\item{model}{\link[rlang:chr]{chr} Logistic regression ("LR"), random forest ("RF"), or boosted tree ("BT")} -\item{split}{\link[pillar:num]{pillar::num} Vector of length 2 indicating the proportion of data to +\item{split}{\link[pillar:num]{num} Vector of length 2 indicating the proportion of data to be designated as training and validation, respectively. Note: if \code{test_data} is provided, these numbers will be scaled so that they sum to 1 and will still represent fractions of \code{ml_input_tibble} (not including the input @@ -47,26 +47,26 @@ function is not equipped to handle this. If cross-validation is enabled here \code{split = c(1,0)}, we will still retain a 20\% test holdout for final reporting. Cross-validation is run on the 80\% training portion, and not on the testing set.} -\item{n_fold}{\link[pillar:num]{pillar::num} Number of folds of cross-validation} +\item{n_fold}{\link[pillar:num]{num} Number of folds of cross-validation} -\item{prop_vi_top_feats}{\link[pillar:num]{pillar::num} A vector of length 2 with elements together +\item{prop_vi_top_feats}{\link[pillar:num]{num} A vector of length 2 with elements together indicating the proportion of total variable importance the top features should comprise. To get the features that contribute to the top 10 to 20\% of total variable importance, for example, \verb{set prop_vi_top_feats = c(0.1, 0.2)}. Returns all features by default.} -\item{n_top_feats}{\link[pillar:num]{pillar::num} Number of top features to extract per drug} +\item{n_top_feats}{\link[pillar:num]{num} Number of top features to extract per drug} -\item{use_pca}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to use PCA instead of all features.} +\item{use_pca}{\link[arrow:bool]{bool} Set to \code{TRUE} to use PCA instead of all features.} -\item{pca_threshold}{\link[pillar:num]{pillar::num} The proportion of total variance for which the +\item{pca_threshold}{\link[pillar:num]{num} The proportion of total variance for which the principle components account} -\item{penalty_vec}{\link[pillar:num]{pillar::num} A vector containing \code{penalty} (regularization +\item{penalty_vec}{\link[pillar:num]{num} A vector containing \code{penalty} (regularization strength) values to try (for logistic regression). It is recommended to choose values \code{10^-4} to \code{10^4}.} -\item{mix_vec}{\link[pillar:num]{pillar::num} A vector containing \code{mixture} values to try for logistic +\item{mix_vec}{\link[pillar:num]{num} A vector containing \code{mixture} values to try for logistic regression. 0 corresponds to L2 regularization; 1 corresponds to L1; intermediate values correspond to elastic net.} @@ -78,12 +78,12 @@ or boosted tree. It is recommended to choose values in the range 1 to 100.} \code{trees} in random forest or boosted tree. It is recommended to choose values in the range 100 to 1000.} -\item{select_best_metric}{\link[rlang:vector-construction]{rlang::chr} Metric to select best model: "f_meas", +\item{select_best_metric}{\link[rlang:chr]{chr} Metric to select best model: "f_meas", "pr_auc", or "bal_accuracy"} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} For reproducible analysis} -\item{shuffle_labels}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to randomly shuffle AMR phenotype +\item{shuffle_labels}{\link[arrow:bool]{bool} Set to \code{TRUE} to randomly shuffle AMR phenotype labels for baseline comparisons.} \item{test_data}{A tibble to use as testing data instead of a subset of @@ -91,14 +91,14 @@ labels for baseline comparisons.} or temporal holdouts. The \code{split} argument still tells how \code{ml_input_tibble} should be divided for training and validation.} -\item{return_tune_res}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return tuning results.} +\item{return_tune_res}{\link[arrow:bool]{bool} Set to \code{TRUE} to return tuning results.} -\item{return_fit}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return the model \code{fit}.} +\item{return_fit}{\link[arrow:bool]{bool} Set to \code{TRUE} to return the model \code{fit}.} -\item{return_pred}{\link[arrow:data-type]{arrow::bool} Set to \code{TRUE} to return the predicted and actual +\item{return_pred}{\link[arrow:bool]{bool} Set to \code{TRUE} to return the predicted and actual AMR phenotypes.} -\item{verbose}{\link[arrow:data-type]{arrow::bool} The function will stay quiet if set to \code{FALSE}.} +\item{verbose}{\link[arrow:bool]{bool} The function will stay quiet if set to \code{FALSE}.} } \value{ A list with two elements: A \code{performance_tibble} and a @@ -109,3 +109,13 @@ respectively, are set to \code{TRUE}. \description{ Stitches together core ML functions into one pipeline. } +\examples{ +data(demo_ml_tibble) +set.seed(1) +runMLPipeline( + ml_input_tibble = demo_ml_tibble, model = "LR", + split = c(1, 0), n_fold = 2, + penalty_vec = 10^c(-3, -1), mix_vec = c(0, 0.5, 1), + n_top_feats = 10, verbose = FALSE +) +} diff --git a/man/runMLmodels.Rd b/man/runMLmodels.Rd index cb44718..2020be2 100644 --- a/man/runMLmodels.Rd +++ b/man/runMLmodels.Rd @@ -20,7 +20,8 @@ runMLmodels( return_pred = TRUE, use_saved_split = TRUE, shuffle_labels = FALSE, - use_pca = FALSE + use_pca = FALSE, + seed = 5280 ) } \arguments{ @@ -55,6 +56,8 @@ runMLmodels( \item{shuffle_labels}{Logical. Randomly shuffle labels for baseline runs. Default FALSE.} \item{use_pca}{Logical. Use PCA on predictors. Default FALSE.} + +\item{seed}{Integer. Seed for the parallel RNG streams (\code{BiocParallel}). Default 5280.} } \value{ NULL (invisible). Called for side effects (model training and result saving). @@ -121,8 +124,7 @@ For example: \code{"LOO_cross_test_ML_year_performance.tsv"} \note{ This function requires the following packages: \itemize{ -\item \pkg{future} - for parallel processing backend -\item \pkg{future.apply} - for parallel lapply +\item \pkg{BiocParallel} - for parallel processing backend \item \pkg{readr} - for reading/writing TSV files \item \pkg{dplyr}, \pkg{purrr}, \pkg{stringr}, \pkg{tibble} - for data manipulation } @@ -143,21 +145,24 @@ runMLmodels("/path/to/results", stratify_by = "year") # Cross-test with year stratification runMLmodels("/path/to/results", - stratify_by = "year", - cross_test = TRUE, - threads = 32) + stratify_by = "year", + cross_test = TRUE, + threads = 32 +) # LOO analysis stratified by country with cross-testing runMLmodels("/path/to/results", - stratify_by = "country", - LOO = TRUE, - cross_test = TRUE, - verbose = TRUE) + stratify_by = "country", + LOO = TRUE, + cross_test = TRUE, + verbose = TRUE +) # Run without saving model fits (save disk space) runMLmodels("/path/to/results", - stratify_by = "year", - return_fit = FALSE) + stratify_by = "year", + return_fit = FALSE +) } } diff --git a/man/runModelingPipeline.Rd b/man/runModelingPipeline.Rd index 84ea022..a0763be 100644 --- a/man/runModelingPipeline.Rd +++ b/man/runModelingPipeline.Rd @@ -49,3 +49,14 @@ Given a DuckDB file produced by \code{runDataProcessing()}, it: \item Saves performance metrics, fitted models, predictions, and top feature rankings } } +\examples{ +\dontrun{ +runModelingPipeline( + parquet_duckdb_path = "path/to/Bug_parquet.duckdb", + threads = 16, n_fold = 5, split = c(1, 0), min_n = 25, + prop_vi_top_feats = c(0, 1), pca_threshold = 0.99, + verbose = TRUE, use_saved_split = TRUE +) +} + +} diff --git a/man/selectBestModel.Rd b/man/selectBestModel.Rd index 23232bf..e507795 100644 --- a/man/selectBestModel.Rd +++ b/man/selectBestModel.Rd @@ -11,7 +11,7 @@ selectBestModel(tune_res, wflow, select_best_metric = "mcc") \item{wflow}{A \code{workflows} object, such as the output of \code{buildWflow()}} -\item{select_best_metric}{\link[rlang:vector-construction]{rlang::chr} Metric to select best model: "f_meas", +\item{select_best_metric}{\link[rlang:chr]{chr} Metric to select best model: "f_meas", "pr_auc", "mcc", or "bal_accuracy"} } \value{ @@ -20,3 +20,17 @@ Best model workflow \description{ Selects best model by F1 score, AUPRC, MCC, or balanced accuracy. } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +wflow <- buildWflow( + buildLRModel(), + buildRecipe(rsample::training(data_split)) +) +set.seed(1) +tune_res <- tuneGrid(wflow, data_split, + buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)), + n_fold = 2 +) +selectBestModel(tune_res, wflow, "mcc") +} diff --git a/man/shuffleLabels.Rd b/man/shuffleLabels.Rd index 9c672b5..83a6c72 100644 --- a/man/shuffleLabels.Rd +++ b/man/shuffleLabels.Rd @@ -14,7 +14,7 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} For reproducible analysis} } \value{ Pangenome with randomly shuffled AMR phenotype labels @@ -22,3 +22,11 @@ Pangenome with randomly shuffled AMR phenotype labels \description{ Randomly shuffles the AMR phenotype labels in the ML-ready tibble. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:6), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 3), + feat_a = c(1L, 0L, 1L, 0L, 1L, 0L) +) +shuffleLabels(ml, seed = 42) +} diff --git a/man/splitMLInputTibble.Rd b/man/splitMLInputTibble.Rd index 7f774b9..12b324b 100644 --- a/man/splitMLInputTibble.Rd +++ b/man/splitMLInputTibble.Rd @@ -4,7 +4,7 @@ \alias{splitMLInputTibble} \title{splitMLInputTibble()} \usage{ -splitMLInputTibble(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) +splitMLInputTibble(ml_input_tibble, split = c(0.6, 0.2), seed = NULL) } \arguments{ \item{ml_input_tibble}{An ML-ready tibble generated by \code{loadMLInputTibble()}. @@ -14,10 +14,13 @@ classification for one bug/drug combination) or \code{resistant_classes} (multi-class classification for determining the drug classes to which each genome is resistant), but not both.} -\item{split}{\link[pillar:num]{pillar::num} Vector of length 2 indicating the proportion of data to +\item{split}{\link[pillar:num]{num} Vector of length 2 indicating the proportion of data to be designated as training and validation, respectively.} -\item{seed}{\link[pillar:num]{pillar::num} For reproducible analysis} +\item{seed}{\link[pillar:num]{num} Optional. If supplied, the split is seeded (and the +caller's RNG state restored afterward) for standalone reproducibility. When +\code{NULL} (the default, as used by \code{runMLPipeline()}), the split inherits the +ambient RNG stream so it can share one seed with downstream tuning and fitting.} } \value{ An \code{rsplit} object @@ -26,3 +29,12 @@ An \code{rsplit} object Splits an ML-ready tibble into training and testing (and in some cases validation) sets. } +\examples{ +ml <- tibble::tibble( + genome_id = paste0("g", 1:20), + genome_drug.resistant_phenotype = rep(c("Resistant", "Susceptible"), each = 10), + feat_a = rep(c(0L, 1L), 10), + feat_b = rep(c(1L, 0L), 10) +) +splitMLInputTibble(ml, split = c(1, 0), seed = 42) +} diff --git a/man/tuneGrid.Rd b/man/tuneGrid.Rd index d03a8c3..9d985a5 100644 --- a/man/tuneGrid.Rd +++ b/man/tuneGrid.Rd @@ -15,7 +15,7 @@ tuneGrid(wflow, data_split, grid = buildTuningGrid(model = "LR"), n_fold = 5) \item{grid}{A tuning grid as a tibble, such as the output of \code{buildTuningGrid()}} -\item{n_fold}{\link[pillar:num]{pillar::num} Number of folds of cross-validation} +\item{n_fold}{\link[pillar:num]{num} Number of folds of cross-validation} } \value{ Results of grid tuning @@ -24,3 +24,14 @@ Results of grid tuning Tunes the model according to the tuning grid, workflow, and data split specified. } +\examples{ +data(demo_ml_tibble) +data_split <- splitMLInputTibble(demo_ml_tibble, split = c(1, 0), seed = 1) +wflow <- buildWflow( + buildLRModel(), + buildRecipe(rsample::training(data_split)) +) +grid <- buildTuningGrid("LR", 10^c(-3, -1), c(0, 0.5, 1)) +set.seed(1) +tuneGrid(wflow, data_split, grid, n_fold = 2) +} diff --git a/tests/testthat/test-pipeline.R b/tests/testthat/test-pipeline.R index 6bb0043..8333942 100644 --- a/tests/testthat/test-pipeline.R +++ b/tests/testthat/test-pipeline.R @@ -19,6 +19,7 @@ minimal_grid_args <- list( ) run_pipeline <- function(fx, split = c(1, 0), n_fold = 2, ...) { + set.seed(1) suppressMessages(suppressWarnings( runMLPipeline( fx, diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index af5bc8e..be27d7a 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -1,17 +1,19 @@ --- title: "Introduction to amRml" -output: rmarkdown::html_vignette +output: + BiocStyle::html_document: + fig_width: 7 + fig_height: 5 vignette: > %\VignetteIndexEntry{Introduction to amRml} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r setup-opts, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, - comment = "#>", - eval = FALSE + comment = "#>" ) ``` @@ -21,354 +23,285 @@ library(amRml) ## Overview -`amRml` is part of a 3-package suite for predicting antimicrobial resistance (AMR) using machine learning models trained on bacterial genomic features. +`amRml` predicts antimicrobial resistance (AMR) from bacterial genomic features. It consumes a DuckDB produced by `amRdata` and produces ML matrices, tuned logistic regression models, per-genome predictions, feature importances, and Fisher's exact tests as a non-ML baseline. -This vignette demonstrates how to generate ML input matrices, train models, and visualize results. +This vignette uses the *Shigella flexneri* (`Sfl`) DuckDB bundled in `inst/extdata`. + +```{r fixture} +fixture <- system.file("extdata", "Sfl_parquet.duckdb", package = "amRml") +out_dir <- file.path(tempdir(), "amRml_vignette") +dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +``` ## Generating ML input matrices -After data curation with `amRdata`, use `generateMLInputs()` to create ML-ready matrices from the DuckDB containing metadata and feature parquets. -The DuckDB with the parquet views is created with `cleanData()` +`generateMLInputs()` reads the bug-level DuckDB (metadata + feature parquets) and writes one long-format sparse parquet per drug × feature × encoding combination into `out_path/matrix/`. With `stratify_by`, it additionally writes year- or country-stratified matrices into `matrix_year/` or `matrix_country/`. ```{r generate-matrices} -# Generate all ML input matrices from curated data generateMLInputs( - parquet_duckdb_path = "results/Sfl_parquet.duckdb", - out_path = "results/", - n_fold = 5, - split = c(1, 0), # CV mode (or use split = 0 as shorthand) - min_n = 25, - verbosity = "minimal" + parquet_duckdb_path = fixture, + out_path = out_dir, + n_fold = 5, + split = c(1, 0), + min_n = 25, + verbosity = "minimal" ) -``` - -This generates: - -- Drug and drug-class matrices for all feature types (genes, proteins, domains, struct) -- Year-stratified and country-stratified matrices -- Leave-one-out (LOO) matrices for temporal/geographic holdouts -- Multi-drug resistance (MDR) matrices - -For classical train/validation/test splits instead of cross-validation: -```{r generate-splits} -# 70% train, 15% validation, 15% test -generateMLInputs( - parquet_duckdb_path = "results/Sfl_parquet.duckdb", - out_path = "results/", - n_fold = NULL, - split = c(0.7, 0.15), - verbosity = "debug" -) +list.files(file.path(out_dir, "matrix"))[1:5] ``` -Use `skipImbalancedMatrix()` to check if a drug-bug combination has sufficient samples: +For a classical train/validation/test split instead of cross-validation, set `n_fold = NULL` and pass a length-2 `split` of `c(train_prop, val_prop)`; the test proportion is the remainder. -```{r skip-check} -skip <- skipImbalancedMatrix( - genome_ids = genome_ids, - phenotype_counts = phenotype_counts, - n_fold = 5, - split = c(1, 0), - min_total_obs = 40 +```{r generate-splits, eval = FALSE} +generateMLInputs( + parquet_duckdb_path = fixture, + out_path = out_dir, + n_fold = NULL, + split = c(0.7, 0.15), + verbosity = "debug" ) ``` -## Loading ML matrices +## Loading a single matrix -The package expects Parquet files in long (sparse) format with these columns: +The generated parquets are long-format sparse tibbles with these columns: | Column | Description | |--------|-------------| | `genome_id` | Unique identifier for each isolate | | `feature_id` | Feature name (gene, protein, domain, or struct) | -| `value` | Binary presence/absence (0 or 1) or count | -| `genome_drug.resistant_phenotype` | "Resistant" or "Susceptible" | +| `value` | Binary presence/absence (0/1) or count | +| `genome_drug.resistant_phenotype` | `"Resistant"` or `"Susceptible"` | -`loadMLInputTibble()` converts this to wide format (one row per genome, one column per feature) for ML. +`loadMLInputTibble()` converts one of them to wide format (one row per genome, one column per feature) ready for ML. ```{r load-data} -# Load a generated matrix -ml_tibble <- loadMLInputTibble( - parquet_path = "results/matrix/Sfl_drug_AMP_domains_binary_sparse.parquet" +matrix_path <- file.path( + out_dir, "matrix", "Sfl_drug_AMP_genes_binary_sparse.parquet" ) -# Check dimensions +ml_tibble <- loadMLInputTibble(matrix_path) n_features <- getNumFeat(ml_tibble) -target_var <- getTargetVarName(ml_tibble) +target_var <- .getTargetVarName(ml_tibble) + +c(n_features = n_features, target_var = target_var) ``` -## Main pipeline +## Per-matrix pipeline -The primary entry point is `runMLPipeline()`, which orchestrates the entire ML workflow: +`runMLPipeline()` runs the train/tune/fit/predict pipeline on a single matrix in memory. Use it to iterate on one drug-feature combo before scaling to all of them. ```{r run-pipeline} results <- runMLPipeline( - ml_input_tibble = ml_tibble, - model = "LR", - split = c(0.6, 0.2), - n_fold = 2, - n_top_feats = 20, + ml_input_tibble = ml_tibble, + model = "LR", + split = c(1, 0), + n_fold = 2, + n_top_feats = 20, + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1), select_best_metric = "mcc", - return_fit = TRUE, - return_pred = TRUE, - verbose = TRUE + return_fit = TRUE, + return_pred = TRUE, + verbose = FALSE ) -# View results results$performance_tibble -results$top_feat_tibble +head(results$top_feat_tibble) ``` ### Output structure -`runMLPipeline()` returns a list with: +`runMLPipeline()` returns a named list: -**`performance_tibble`** - model performance metrics: +**`performance_tibble`** — one row of model performance metrics: | Column | Description | |--------|-------------| | `num_obs` | Number of observations | | `res_prop` | Proportion of resistant samples | | `n_feat` | Number of features | -| `model` | Model type (LR) | +| `model` | Model type (`"LR"`) | | `train_prop`, `val_prop` | Train/validation split proportions | | `fit_penalty`, `fit_mixture` | Fitted hyperparameters | | `nmcc`, `f1`, `bal_acc`, `log2_apop` | Performance metrics | | `run_time_sec` | Runtime in seconds | -**`top_feat_tibble`** - ranked feature importance: +**`top_feat_tibble`** — ranked feature importance: | Column | Description | |--------|-------------| | `Variable` | Feature name | | `Importance` | Variable importance score | -| `Sign` | Direction of effect (POS = resistance, NEG = susceptibility) | +| `Sign` | Direction of effect (`POS` = associated with resistance, `NEG` = with susceptibility) | **Optional outputs** (when `return_*` = TRUE): -- `tune_res` - tuning results from grid search -- `fit` - fitted model workflow object -- `pred` - predictions with `.pred_class`, `.pred_Resistant`, `.pred_Susceptible` +- `tune_res` — tuning results from grid search +- `fit` — the fitted workflow object +- `pred` — predictions with `.pred_class`, `.pred_Resistant`, `.pred_Susceptible` ## Step-by-step model building -For more control, use the individual tidymodels-based functions: +The builders below are what `runMLPipeline()` chains internally. Call them directly when you need control over any individual step. ```{r split-data} -# Split data data_split <- splitMLInputTibble(ml_tibble, split = c(0.6, 0.2), seed = 123) - -library(rsample) -train_data <- training(data_split) -test_data <- testing(data_split) +train_data <- rsample::training(data_split) +test_data <- rsample::testing(data_split) ``` ```{r build-model} -# Build recipe (preprocessing) -recipe <- buildRecipe(train_data, use_pca = FALSE, pca_threshold = 0.95) +recipe <- buildRecipe(train_data, use_pca = FALSE) +lr_mod <- buildLRModel(multi_class = FALSE) +wflow <- buildWflow(lr_mod, recipe) -# Build logistic regression model -lr_model <- buildLRModel(multi_class = FALSE) - -# Build workflow -wflow <- buildWflow(lr_model, recipe) - -# Build tuning grid grid <- buildTuningGrid( - model = "LR", - penalty_vec = 10^seq(-4, -1, length.out = 10), - mix_vec = 0:5 / 5 + model = "LR", + penalty_vec = 10^c(-3, -1), + mix_vec = c(0, 0.5, 1) ) ``` ```{r tune-fit} -# Tune hyperparameters -tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 5) - -# Select and fit best model +tune_res <- tuneGrid(wflow, data_split, grid, n_fold = 2) best_wflow <- selectBestModel(tune_res, wflow, select_best_metric = "mcc") fit <- fitBestModel(best_wflow, train_data) -# Get fitted hyperparameters -fit_hps <- getFitHps(fit) - -# Make predictions -predictions <- predict(fit, test_data) +preds <- predictML(fit, test_data) ``` ## Performance metrics +`calculateEvalMets()` returns all of nMCC, F1, balanced accuracy, AUPRC, log2(AUPRC/prior), sensitivity, and specificity from a tibble of predictions + truth. + ```{r metrics} -# Individual metrics -nmcc <- calculatenMCC(predictions) # Normalized MCC (0-1 scale) -f1 <- calculateF1(predictions) # F1 score -bal_acc <- calculateBalAcc(predictions) # Balanced accuracy -auprc <- calculateAUPRC(predictions) # Area under PR curve -log2_apop <- calculateLog2APOP(predictions) # log2(AUPRC/prior) - -# All metrics at once -all_metrics <- calculateEvalMets(predictions) - -# Confusion matrix -conf_mat <- getConfusionMatrix(predictions) +calculateEvalMets(preds) +getConfusionMatrix(preds) ``` -| Metric | Function | Description | -|--------|----------|-------------| -| nMCC | `calculatenMCC()` | Normalized Matthews correlation coefficient (0-1) | -| F1 | `calculateF1()` | Harmonic mean of precision and recall | -| Balanced accuracy | `calculateBalAcc()` | Average of sensitivity and specificity | -| AUPRC | `calculateAUPRC()` | Area under precision-recall curve | -| log2(AUPRC/prior) | `calculateLog2APOP()` | Class-imbalance adjusted AUPRC | - ## Feature importance +`extractTopFeats()` ranks features by absolute coefficient (for LR). Use `n_top_feats` for a fixed count or `prop_vi_top_feats` for a percentile range. + ```{r feature-importance} -# Extract top features from fitted model by number top_features <- extractTopFeats(fit, n_top_feats = 20) +head(top_features) +``` -# Or by proportion of variable importance -top_features <- extractTopFeats(fit, prop_vi_top_feats = c(0, 0.2)) +## Visualization -# Columns: Variable, Importance, Sign -# Positive Sign = associated with resistance -# Negative Sign = associated with susceptibility +```{r plot-prc, fig.width = 5, fig.height = 4} +plotPRC(results$pred) ``` -## Iterative feature elimination (IFE) +```{r plot-vi, fig.width = 5, fig.height = 4} +plotTopFeatsVI(results$fit, n_top_feats = 10) +``` -IFE iteratively removes top features to identify the most predictive subset: +For a baseline comparison against random labels, fit a shuffled-label pipeline and compare: -```{r ife} -ife_results <- runIFE( - ml_tibble, - by_num = TRUE, # Remove by number of features - by_vi = FALSE, # Or remove by variable importance - percent_removal_vec = 10 * 1:9, # Remove 10%, 20%, ..., 90% - mix_vec = 0, # L2 regularization for IFE - return_feats = TRUE, - verbose = TRUE +```{r plot-baseline, eval = FALSE} +shuffled <- runMLPipeline( + ml_input_tibble = ml_tibble, + model = "LR", + split = c(1, 0), + n_fold = 2, + shuffle_labels = TRUE, + return_pred = TRUE ) -# Results include nMCC at each iteration -ife_results$ife_performance_tibble -ife_results$feats_removed - -# Remove specific features manually -ml_tibble_reduced <- removeTopFeats(ml_tibble, top_features) +plotBaselineComparison( + non_shuffled_label_results = results$performance_tibble, + shuffled_label_results = shuffled$performance_tibble +) ``` -## Visualization - -### Precision-recall curve +## Iterative feature elimination -```{r plot-prc} -test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) -plotPRC(test_data_plus_predictions) -``` -### ROC curve +`runIFE()` retrains the model after iteratively removing top-ranked features, helping identify the minimal predictive subset. It runs the pipeline once per percentile in `percent_removal_vec` -```{r plot-roc} -test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) -plotROC(test_data_plus_predictions) -``` -### Variable importance plot +```{r ife, eval = FALSE} +ife_results <- runIFE( + ml_tibble, + by_num = TRUE, + by_vi = FALSE, + percent_removal_vec = 10 * 1:9, + mix_vec = 0, + return_feats = TRUE, + verbose = FALSE +) -```{r plot-vi} -topfeat <- readr::read_tsv(results / ML_top_features / Sfl_drug_AMP_domains_binary_top_features.tsv) -plotTopFeatsVI(topfeat) +ife_results$ife_performance_tibble +ife_results$feats_removed ``` -### Baseline comparison barplot -non_shuffled_label_results Output of `runMLPipeline(shuffle_labels = FALSE)` -shuffled_label_results Output of `runMLPipeline(shuffle_labels = TRUE)` -```{r plot-baseline} -# Run with shuffled labels for baseline -baseline_results <- runMLPipeline( - ml_input_tibble = ml_tibble, - model = "LR", - split = c(0.6, 0.2), - shuffle_labels = TRUE -) +`removeTopFeats()` strips a given set of features from a matrix tibble if you want to do this manually: -# Compare performance -getBaselineComparisonBarplot( - non_shuffled_label_results = results$performance_tibble, - shuffled_label_results = baseline_results$performance_tibble -) +```{r remove-feats} +trimmed <- removeTopFeats(ml_tibble, head(top_features, 5)) +ncol(ml_tibble) - ncol(trimmed) ``` -## Fisher's exact tests (baseline comparison) +## Fisher's exact tests (non-ML baseline) -As a non-ML baseline, run Fisher's exact tests with multiple testing correction: +`runFishers()` runs a Fisher's exact test of feature presence vs. phenotype for each feature, applies Benjamini–Hochberg correction, and computes per-class frequencies. ```{r fisher} -# Complete Fisher pipeline fisher_results <- runFishers( - matrix_path = "results/matrix/Cje_drug_CIP_genes_binary_sparse.parquet", - Q = 0.05, - alternative = "two.sided", + matrix_path = matrix_path, + Q = 0.05, + alternative = "two.sided", susceptible_label = "Susceptible", - resistant_label = "Resistant" + resistant_label = "Resistant" ) -# Or step-by-step: -encoded <- encodePhenotype(ml_tibble) -fisher_res <- runFisherTests(encoded$df, encoded$target, alternative = "two.sided") -fisher_res <- applyBenjaminiHochberg(fisher_res, Q = 0.05) -fisher_res <- computeFeatureFreq(encoded$df, fisher_res, encoded$target) +head(fisher_results) ``` -### Plot Fisher's results -You can label the top N features to highlight the strongest hits (default is 5) or change the BH-adjusted p-value threshold with the alpha argument - -```{r} -plotFishers(fisher_results) -plotFishers(fisher_results, alpha = 0.01, label_top_n = 5) +```{r plot-fishers, fig.width = 5, fig.height = 4} +plotFishers(fisher_results, alpha = 0.05, label_top_n = 5) ``` -## Wrapper to run all models -Given a DuckDB file produced by `runDataProcessing()`, it: -1. generates all ML feature matrices (drug, class, year, country, MDR, LOO) -2. creates all ML directory structures -3. prepares ML input lists for every mode -4. runs logistic regression ML models (standard + stratified + MDR) -5. saves performance metrics, fitted models, predictions, and top feature rankings -``` {r} -runModelingPipeline(parquet_duckdb_path, - threads = 16, - n_fold = 5, - split = c(1, 0), - min_n = 25, - prop_vi_top_feats = c(0, 1), - pca_threshold = 0.99, - verbose = TRUE, +## Training all models with `runMLmodels` + +`runMLmodels()` trains a model on every matrix produced by `generateMLInputs()` and writes performance TSVs into `out_path/ML_performance/`, predictions into `ML_pred/`, and top features into `ML_top_features/`. Takes over an hour on Sfl with default settings. + +```{r run-all-models, eval = FALSE} +runMLmodels( + path = out_dir, + stratify_by = NULL, + LOO = FALSE, + cross_test = FALSE, + threads = max(1L, parallel::detectCores() - 1L), + split = c(1, 0), + n_fold = 5, + verbose = TRUE, + return_pred = TRUE, use_saved_split = TRUE ) ``` -Merge the performance and top features of each kind of models into a parquet that will serve as starting data for `amRshiny` package - -``` {r} -buildPerformancePq( - path = "results/ML_perf", - stratify_by = NULL, - LOO = FALSE, - MDR = FALSE, - cross_test = FALSE, - out_parquet = NULL, - compression = "zstd", - verbose = TRUE -) -buildTopFeatsPq( - path = "results/ML_top_features", - stratify_by = NULL, - LOO = FALSE, - MDR = FALSE, - cross_test = FALSE, - out_parquet = NULL, - compression = "zstd", - verbose = TRUE +## End-to-end: `runModelingPipeline` + +For the full pipeline from a DuckDB to all outputs in one call: + +```{r run-modeling-pipeline, eval = FALSE} +runModelingPipeline( + parquet_duckdb_path = fixture, + threads = max(1L, parallel::detectCores() - 1L), + n_fold = 5, + split = c(1, 0), + min_n = 25, + prop_vi_top_feats = c(0, 1), + pca_threshold = 0.99, + verbose = TRUE, + use_saved_split = TRUE ) ``` + +## Session info + +```{r session-info} +sessionInfo() +```