From 20689b3b4b084b086df685648d2c304c2b2577fd Mon Sep 17 00:00:00 2001 From: Timothy Sundell Date: Wed, 13 May 2026 12:05:42 +0200 Subject: [PATCH] Now prioritizes all_contig for vdj data --- R/TS_cellranger_file_collection.R | 104 ++++++++++++++++------ man/TS_collect_cellranger_files.Rd | 14 +-- man/TS_find_cellranger_samples.Rd | 14 ++- man/TS_plan_cellranger_file_collection.Rd | 17 ++-- 4 files changed, 107 insertions(+), 42 deletions(-) diff --git a/R/TS_cellranger_file_collection.R b/R/TS_cellranger_file_collection.R index 229be6d..fec6b09 100644 --- a/R/TS_cellranger_file_collection.R +++ b/R/TS_cellranger_file_collection.R @@ -6,6 +6,11 @@ #' `cellranger multi` layouts such as `outs/per_sample_outs/` and for #' project folders organised as `project//cellranger_outs`. #' +#' The filtered gene expression matrix is detected at sample level. V(D)J +#' directories are detected at run level, for example `outs/vdj_b` and +#' `outs/vdj_t`, because `cellranger multi` stores the unfiltered +#' `all_contig*` files there rather than in `per_sample_outs/`. +#' #' @param input_dir Character scalar. Project or Cell Ranger output directory to #' search. #' @param sample_regex Optional regular expression used to extract sample names @@ -17,8 +22,8 @@ #' @param vdj_dir_names Character vector of V(D)J directory names to detect. #' #' @return A tibble with one row per detected sample matrix and columns -#' `sample_id`, `sample_root`, `matrix_dir`, `vdj_b_dir`, `vdj_t_dir`, and -#' `detection_note`. +#' `sample_id`, `sample_root`, `outs_dir`, `matrix_dir`, `vdj_b_dir`, +#' `vdj_t_dir`, and `detection_note`. #' #' @examples #' project <- file.path(tempdir(), "cellranger_project") @@ -68,6 +73,7 @@ TS_find_cellranger_samples <- function( return(tibble::tibble( sample_id = character(), sample_root = character(), + outs_dir = character(), matrix_dir = character(), vdj_b_dir = character(), vdj_t_dir = character(), @@ -80,14 +86,15 @@ TS_find_cellranger_samples <- function( matrix_dir, sample_regex = sample_regex ) - vdj_dirs <- .TS_find_vdj_dirs( - inferred$sample_root, + vdj_dirs <- .TS_find_raw_vdj_dirs( + inferred$outs_dir, vdj_dir_names = vdj_dir_names ) tibble::tibble( sample_id = inferred$sample_id, sample_root = inferred$sample_root, + outs_dir = inferred$outs_dir, matrix_dir = matrix_dir, vdj_b_dir = .TS_first_or_na(vdj_dirs[basename(vdj_dirs) == "vdj_b"]), vdj_t_dir = .TS_first_or_na(vdj_dirs[basename(vdj_dirs) == "vdj_t"]), @@ -102,10 +109,12 @@ TS_find_cellranger_samples <- function( #' #' @description #' Builds a copy plan for selected Cell Ranger files without changing the file -#' system. Matrix files are planned into per-sample -#' `filtered_feature_bc_matrix` directories with their original filenames, so -#' they remain directly readable by Seurat. V(D)J files are planned into -#' per-sample `vdj_b` and `vdj_t` directories with the sample name prepended. +#' system. Matrix files are planned from sample-level filtered matrix output +#' into per-sample `filtered_feature_bc_matrix` directories with their original +#' filenames, so they remain directly readable by Seurat. V(D)J files are +#' planned from run-level unfiltered `outs/vdj_b` and `outs/vdj_t` directories +#' into per-sample `vdj_b` and `vdj_t` directories with the sample name +#' prepended. #' #' @param input_dir Character scalar. Project or Cell Ranger output directory to #' search. @@ -115,9 +124,9 @@ TS_find_cellranger_samples <- function( #' See [TS_find_cellranger_samples()]. #' @param overwrite Logical. If `FALSE`, the function errors when planned target #' files already exist. -#' @param strict Logical. If `TRUE`, existing V(D)J directories must contain all -#' requested V(D)J files. If `FALSE`, missing V(D)J files are recorded in the -#' plan and available files can still be copied. +#' @param strict Logical. If `TRUE`, existing raw V(D)J directories must contain +#' all requested `all_contig*` files. If `FALSE`, missing V(D)J files are +#' recorded in the plan and available files can still be copied. #' @param ... Additional arguments passed to [TS_find_cellranger_samples()], such #' as `matrix_dir_names` or `vdj_dir_names`. #' @@ -174,6 +183,8 @@ TS_plan_cellranger_file_collection <- function( ) } + .TS_validate_cellranger_vdj_sample_scope(samples) + dest_dir <- normalizePath(dest_dir, winslash = "/", mustWork = FALSE) matrix_files <- c("barcodes.tsv.gz", "features.tsv.gz", "matrix.mtx.gz") @@ -273,9 +284,10 @@ TS_plan_cellranger_file_collection <- function( #' #' @description #' Copies selected Cell Ranger files into a transfer-friendly destination -#' directory. Each sample receives its own folder, matrix filenames are kept -#' unchanged for Seurat compatibility, and V(D)J files are renamed with the -#' sample ID as a prefix. +#' directory. Each sample receives its own folder. Filtered matrix filenames are +#' kept unchanged for Seurat compatibility, and unfiltered V(D)J `all_contig*` +#' files are renamed with the sample ID as a prefix. Per-sample V(D)J folders +#' are intentionally not used because they contain filtered contig outputs. #' #' @param input_dir Character scalar. Project or Cell Ranger output directory to #' search. @@ -287,9 +299,9 @@ TS_plan_cellranger_file_collection <- function( #' @param confirm Logical. If `TRUE`, ask for confirmation before copying. #' @param execute Logical. If `FALSE`, return the planned copy operations without #' creating directories or copying files. -#' @param strict Logical. If `TRUE`, existing V(D)J directories must contain all -#' requested V(D)J files. If `FALSE`, missing V(D)J files are skipped and -#' recorded in the returned plan. +#' @param strict Logical. If `TRUE`, existing raw V(D)J directories must contain +#' all requested `all_contig*` files. If `FALSE`, missing V(D)J files are +#' skipped and recorded in the returned plan. #' @param ... Additional arguments passed to [TS_find_cellranger_samples()], such #' as `matrix_dir_names` or `vdj_dir_names`. #' @@ -496,6 +508,7 @@ TS_collect_cellranger_files <- function( sample_index <- marker_index + 1 sample_id <- parts[[sample_index]] sample_root <- .TS_path_from_parts(parts, sample_index) + outs_dir <- .TS_path_from_parts(parts, marker_index - 1) detection_note <- paste0(parts[[marker_index]], "/") } else { output_markers <- which(parts %in% c("cellranger_outs", "outs")) @@ -505,10 +518,12 @@ TS_collect_cellranger_files <- function( sample_index <- output_index - 1 sample_id <- parts[[sample_index]] sample_root <- .TS_path_from_parts(parts, output_index) + outs_dir <- sample_root detection_note <- paste0(parts[[output_index]], " parent directory") } else { sample_id <- basename(dirname(matrix_dir)) sample_root <- dirname(matrix_dir) + outs_dir <- sample_root detection_note <- "matrix parent directory" } } @@ -521,22 +536,19 @@ TS_collect_cellranger_files <- function( list( sample_id = sample_id, sample_root = normalizePath(sample_root, winslash = "/", mustWork = TRUE), + outs_dir = normalizePath(outs_dir, winslash = "/", mustWork = TRUE), detection_note = detection_note ) } -.TS_find_vdj_dirs <- function(sample_root, vdj_dir_names) { - if (!dir.exists(sample_root)) { +.TS_find_raw_vdj_dirs <- function(outs_dir, vdj_dir_names) { + if (!dir.exists(outs_dir)) { return(character()) } - direct <- file.path(sample_root, vdj_dir_names) + direct <- file.path(outs_dir, vdj_dir_names) direct <- direct[dir.exists(direct)] - - nested <- .TS_list_dirs(sample_root) - nested <- nested[basename(nested) %in% vdj_dir_names] - - unique(normalizePath(c(direct, nested), winslash = "/", mustWork = TRUE)) + unique(normalizePath(direct, winslash = "/", mustWork = TRUE)) } .TS_first_or_na <- function(x) { @@ -562,6 +574,48 @@ TS_collect_cellranger_files <- function( } } +.TS_validate_cellranger_vdj_sample_scope <- function(samples) { + samples_with_vdj <- dplyr::filter( + samples, + !is.na(.data$vdj_b_dir) | !is.na(.data$vdj_t_dir) + ) + + if (nrow(samples_with_vdj) == 0) { + return(invisible(samples)) + } + + outs_counts <- dplyr::summarise( + dplyr::group_by(samples_with_vdj, .data$outs_dir), + n = dplyr::n_distinct(.data$sample_id), + .groups = "drop" + ) + multi_sample_outs <- dplyr::filter(outs_counts, .data$n > 1) + + if (nrow(multi_sample_outs) == 0) { + return(invisible(samples)) + } + + affected_samples <- dplyr::filter( + samples_with_vdj, + .data$outs_dir %in% multi_sample_outs$outs_dir + ) + + stop( + "Raw V(D)J all_contig files are run-level outputs, not sample-level outputs. ", + "Multiple samples were detected under the same outs directory, so these ", + "raw V(D)J files will not be copied into per-sample folders automatically:\n", + paste( + paste0( + affected_samples$outs_dir, + " -> ", + affected_samples$sample_id + ), + collapse = "\n" + ), + call. = FALSE + ) +} + .TS_plan_one_cellranger_sample <- function( sample_row, dest_dir, diff --git a/man/TS_collect_cellranger_files.Rd b/man/TS_collect_cellranger_files.Rd index 1151fa0..6a35d81 100644 --- a/man/TS_collect_cellranger_files.Rd +++ b/man/TS_collect_cellranger_files.Rd @@ -32,9 +32,9 @@ error before copying starts.} \item{execute}{Logical. If \code{FALSE}, return the planned copy operations without creating directories or copying files.} -\item{strict}{Logical. If \code{TRUE}, existing V(D)J directories must contain all -requested V(D)J files. If \code{FALSE}, missing V(D)J files are skipped and -recorded in the returned plan.} +\item{strict}{Logical. If \code{TRUE}, existing raw V(D)J directories must contain +all requested \verb{all_contig*} files. If \code{FALSE}, missing V(D)J files are +skipped and recorded in the returned plan.} \item{...}{Additional arguments passed to \code{\link[=TS_find_cellranger_samples]{TS_find_cellranger_samples()}}, such as \code{matrix_dir_names} or \code{vdj_dir_names}.} @@ -44,9 +44,10 @@ Invisibly returns the copy plan tibble. } \description{ Copies selected Cell Ranger files into a transfer-friendly destination -directory. Each sample receives its own folder, matrix filenames are kept -unchanged for Seurat compatibility, and V(D)J files are renamed with the -sample ID as a prefix. +directory. Each sample receives its own folder. Filtered matrix filenames are +kept unchanged for Seurat compatibility, and unfiltered V(D)J \verb{all_contig*} +files are renamed with the sample ID as a prefix. Per-sample V(D)J folders +are intentionally not used because they contain filtered contig outputs. } \examples{ project <- file.path(tempdir(), "cellranger_project_collect") @@ -59,4 +60,5 @@ TS_collect_cellranger_files( dest_dir = file.path(tempdir(), "cellranger_transfer_collect"), execute = FALSE ) + } diff --git a/man/TS_find_cellranger_samples.Rd b/man/TS_find_cellranger_samples.Rd index 67f010e..cc1e05b 100644 --- a/man/TS_find_cellranger_samples.Rd +++ b/man/TS_find_cellranger_samples.Rd @@ -27,14 +27,19 @@ for. Defaults to Cell Ranger's \code{filtered_feature_bc_matrix} and } \value{ A tibble with one row per detected sample matrix and columns -\code{sample_id}, \code{sample_root}, \code{matrix_dir}, \code{vdj_b_dir}, \code{vdj_t_dir}, and -\code{detection_note}. +\code{sample_id}, \code{sample_root}, \code{outs_dir}, \code{matrix_dir}, \code{vdj_b_dir}, +\code{vdj_t_dir}, and \code{detection_note}. } \description{ Recursively searches a project folder for Cell Ranger matrix directories and infers the corresponding sample names. The function is designed for common -\code{cellranger multi} layouts such as \code{outs/per_sample_outs/} and for -project folders organised as \code{project//cellranger_outs}. +\verb{cellranger multi} layouts such as \verb{outs/per_sample_outs/} and for +project folders organised as \verb{project//cellranger_outs}. + +The filtered gene expression matrix is detected at sample level. V(D)J +directories are detected at run level, for example \code{outs/vdj_b} and +\code{outs/vdj_t}, because \verb{cellranger multi} stores the unfiltered +\verb{all_contig*} files there rather than in \verb{per_sample_outs/}. } \examples{ project <- file.path(tempdir(), "cellranger_project") @@ -43,4 +48,5 @@ dir.create(matrix_dir, recursive = TRUE, showWarnings = FALSE) file.create(file.path(matrix_dir, c("barcodes.tsv.gz", "features.tsv.gz", "matrix.mtx.gz"))) TS_find_cellranger_samples(project) + } diff --git a/man/TS_plan_cellranger_file_collection.Rd b/man/TS_plan_cellranger_file_collection.Rd index 7261cb7..944d792 100644 --- a/man/TS_plan_cellranger_file_collection.Rd +++ b/man/TS_plan_cellranger_file_collection.Rd @@ -26,9 +26,9 @@ See \code{\link[=TS_find_cellranger_samples]{TS_find_cellranger_samples()}}.} \item{overwrite}{Logical. If \code{FALSE}, the function errors when planned target files already exist.} -\item{strict}{Logical. If \code{TRUE}, existing V(D)J directories must contain all -requested V(D)J files. If \code{FALSE}, missing V(D)J files are recorded in the -plan and available files can still be copied.} +\item{strict}{Logical. If \code{TRUE}, existing raw V(D)J directories must contain +all requested \verb{all_contig*} files. If \code{FALSE}, missing V(D)J files are +recorded in the plan and available files can still be copied.} \item{...}{Additional arguments passed to \code{\link[=TS_find_cellranger_samples]{TS_find_cellranger_samples()}}, such as \code{matrix_dir_names} or \code{vdj_dir_names}.} @@ -40,10 +40,12 @@ include \code{sample_id}, \code{file_group}, \code{source_path}, \code{dest_path } \description{ Builds a copy plan for selected Cell Ranger files without changing the file -system. Matrix files are planned into per-sample -\code{filtered_feature_bc_matrix} directories with their original filenames, so -they remain directly readable by Seurat. V(D)J files are planned into -per-sample \code{vdj_b} and \code{vdj_t} directories with the sample name prepended. +system. Matrix files are planned from sample-level filtered matrix output +into per-sample \code{filtered_feature_bc_matrix} directories with their original +filenames, so they remain directly readable by Seurat. V(D)J files are +planned from run-level unfiltered \code{outs/vdj_b} and \code{outs/vdj_t} directories +into per-sample \code{vdj_b} and \code{vdj_t} directories with the sample name +prepended. } \examples{ project <- file.path(tempdir(), "cellranger_project_plan") @@ -56,4 +58,5 @@ plan <- TS_plan_cellranger_file_collection( dest_dir = file.path(tempdir(), "cellranger_transfer_plan") ) plan + }