Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used

Dear Lukas,

I have fixed the `calcCounts` and `calcMedians` functions so that the rows and columns of SCEs are read properly.
I have also added an option to pass an argument to the function that sets the (meta)clustering (from `colnames(colData(SCE))`) to be used when making the calculations. The argument has to be passed as a string (in "double quotes").

This function outputs counts per (meta)cluster per sample:
```
# Define the function that replaces calcCounts:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering from colnames(colData(SCE)) (in quotes ""). The cell counts will be calculated for each (meta)cluster.
calcCountsFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    coldata_df <- as.data.frame(colData(SCE))
    n_cells <- coldata_df %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% tally %>% complete(sample_id)
    n_cells <- acast(n_cells, paste0(clustering, " ~ sample_id"), value.var = "n", fill = 0)
    if (nrow(n_cells) < nlevels(colData(SCE)[[clustering]])) {
        missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(n_cells)))
        n_cells_tmp <- matrix(0, nrow = length(missing_markers), ncol = ncol(n_cells))
        rownames(n_cells_tmp) <- missing_markers
        n_cells <- rbind(n_cells, n_cells_tmp)
        n_cells <- n_cells[order(as.numeric(rownames(n_cells))), , drop = FALSE]
    }
    n_cells_total <- rowSums(n_cells)
    row_data <- setNames(data.frame(factor(rownames(n_cells), levels = levels(colData(SCE)[[clustering]])), n_cells_total, stringsAsFactors = FALSE),
                         c(clustering, "n_cells"))
    col_data <- metadata(SCE)$experiment_info
    n_cells <- n_cells[, match(col_data$sample_id, colnames(n_cells)), drop = FALSE]
    stopifnot(all(col_data$sample_id == colnames(n_cells)))
    d_counts <- SummarizedExperiment(assays = list(counts = n_cells), 
                                     rowData = row_data,
                                     colData = col_data)
    d_counts
}
```

This function outputs the median signal values from the arcSinh-transformed signal values:
```
# Define the function that replaces calcMedians:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["exprs"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}
```

This function outputs the median data made from raw signal values:
```
# The previous function outputs the median expression levels as arcSinh-transformed values with an argument of 5 (or which ever number
# you set as the value for the "cofactor" in "prepData"). In the case of CyTOF data, "cofactor = 5", which gives the formula "asinh(x/5)".
# In the following function, "[["exprs"]]" has been replaced by "[["counts"]]". This makes the function output raw, untransformed medians
# of marker expression. Be advised, arcSinh-transformed values should be used for statistical analyses due to being more comparable
# between markers and/or samples, unlike raw values, which can, for instance, vary from 5 to 5000.

# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed_raw <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["counts"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}
```

Please let me know if you decide to incorporate them into diffcyt.

Kind regards.
Luka Tandaric

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used #55

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used #55

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions