Skip to content

Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used #55

@LukaTandaric

Description

@LukaTandaric

Dear Lukas,

I have fixed the calcCounts and calcMedians functions so that the rows and columns of SCEs are read properly.
I have also added an option to pass an argument to the function that sets the (meta)clustering (from colnames(colData(SCE))) to be used when making the calculations. The argument has to be passed as a string (in "double quotes").

This function outputs counts per (meta)cluster per sample:

# Define the function that replaces calcCounts:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering from colnames(colData(SCE)) (in quotes ""). The cell counts will be calculated for each (meta)cluster.
calcCountsFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    coldata_df <- as.data.frame(colData(SCE))
    n_cells <- coldata_df %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% tally %>% complete(sample_id)
    n_cells <- acast(n_cells, paste0(clustering, " ~ sample_id"), value.var = "n", fill = 0)
    if (nrow(n_cells) < nlevels(colData(SCE)[[clustering]])) {
        missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(n_cells)))
        n_cells_tmp <- matrix(0, nrow = length(missing_markers), ncol = ncol(n_cells))
        rownames(n_cells_tmp) <- missing_markers
        n_cells <- rbind(n_cells, n_cells_tmp)
        n_cells <- n_cells[order(as.numeric(rownames(n_cells))), , drop = FALSE]
    }
    n_cells_total <- rowSums(n_cells)
    row_data <- setNames(data.frame(factor(rownames(n_cells), levels = levels(colData(SCE)[[clustering]])), n_cells_total, stringsAsFactors = FALSE),
                         c(clustering, "n_cells"))
    col_data <- metadata(SCE)$experiment_info
    n_cells <- n_cells[, match(col_data$sample_id, colnames(n_cells)), drop = FALSE]
    stopifnot(all(col_data$sample_id == colnames(n_cells)))
    d_counts <- SummarizedExperiment(assays = list(counts = n_cells), 
                                     rowData = row_data,
                                     colData = col_data)
    d_counts
}

This function outputs the median signal values from the arcSinh-transformed signal values:

# Define the function that replaces calcMedians:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["exprs"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}

This function outputs the median data made from raw signal values:

# The previous function outputs the median expression levels as arcSinh-transformed values with an argument of 5 (or which ever number
# you set as the value for the "cofactor" in "prepData"). In the case of CyTOF data, "cofactor = 5", which gives the formula "asinh(x/5)".
# In the following function, "[["exprs"]]" has been replaced by "[["counts"]]". This makes the function output raw, untransformed medians
# of marker expression. Be advised, arcSinh-transformed values should be used for statistical analyses due to being more comparable
# between markers and/or samples, unlike raw values, which can, for instance, vary from 5 to 5000.

# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed_raw <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["counts"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}

Please let me know if you decide to incorporate them into diffcyt.

Kind regards.
Luka Tandaric

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions