Arcadia-Science · austinhpatton · Sep 25, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 27, 2023
diff --git a/bin/cogeqc_summarize_ogs.R b/bin/cogeqc_summarize_ogs.R
@@ -60,6 +60,8 @@ og_dir <- args[1]
 # And the minimum number of species for orthogroup phylogenetic inference
 min_spp <- args[2]
 
+sppid_protid_delim <- args[3]
+
 # Read in the annotations.
 annots <- list.files('./', pattern = "cogeqc_annotations.tsv")
 spps <- gsub("_cogeqc_annotations.tsv", "", annots)
@@ -77,27 +79,37 @@ og_stat_dir <- paste0(og_dir, '/Comparative_Genomics_Statistics/')
 # Go ahead and read in the orthogroups file
 orthogroups <- read_orthogroups(og_file)
 
-# Strip trailing text from species name - may not need in full implementation.
-# Names are determined in orthofinder using the file name, so just include
-# the species here.
-orthogroups$Species <- gsub('[.].*', '', orthogroups$Species)
+# Update the species and gene in this table to correspond to the 
+# species ID and gene ID we are using throughout.
+# NOTE: If using ":" as the species/protein ID delimiter, orthofinder will 
+# replace this with a "_". This means we need to split on the last "_" 
+# given that underscores may have been used in the species ID.
+if(sppid_protid_delim == ":"){
+    orthogroups$Species <- gsub('_[^_]*$', '', orthogroups$Gene)
+    orthogroups$Gene <- gsub('.*_', '', orthogroups$Gene)
+}else{ 
+    # Otherwise, use the specific delimiter that the user provided
+    orthogroups$Species <- gsub(paste0(sppid_protid_delim, '.*'), '', orthogroups$Gene)
+    orthogroups$Gene <- gsub(paste0('.*', sppid_protid_delim), '', orthogroups$Gene)
+}
+
 
 # Get the complete list of species included here
 all_species <- unique(orthogroups$Species)
 
 # Remove any orthogroup members for which we do not have annotations
 orthogroups <- orthogroups[which(orthogroups$Species %in% spps),]
 
+# As before, make sure we pull out the uniprot annotations from between the 
+# pipes in the case that the data were preprocessed using our snakemake workflow
+orthogroups$Gene <- gsub("^[^|]*\\|", "", orthogroups$Gene) %>% gsub("\\|.*$", "", .)
+
 # Pull out the list of species - we are only running this script on a
 # on a subset of species, so we want to be sure that we're not reading in
 # annotations for species other than those we're testing inflation
 # parameters with.
 species <- unique(orthogroups$Species)
 
-# and remove the Species name from the gene name - this will create issues when
-# pairing with the annotations.
-orthogroups$Gene <- gsub('^(?:[^_]*_)*\\s*(.*)', '\\1', orthogroups$Gene)
-
 # Initialize
 interpro <- list()
 oma <- list()
@@ -156,16 +168,23 @@ og_assess_list <- list(
 
 # A quick function to run the assessment in parallel, checking that there are
 # enough species
-get_assessments <-
-    function(i){
-        if(og_assess_list[[i]]$spp_count > 1){
-            assess <- assess_orthogroups(og_assess_list[[i]]$og_set, og_assess_list[[i]]$ann_set)
-            assess <- mean(assess$Mean_score)
-        }else{
-            assess <- NA
-        }
-        return(assess)
+get_assessments <- function(i) {
+    assess <- NA  # Initialize assess with NA
+    if (og_assess_list[[i]]$spp_count > 1) {
+        # Attempt to execute the following block of code
+        tryCatch({
+            assess_temp <- assess_orthogroups(og_assess_list[[i]]$og_set, og_assess_list[[i]]$ann_set)
+            # Only calculate the mean if assess_temp is not NULL
+            if (!is.null(assess_temp)) {
+                assess <- mean(assess_temp$Mean_score)
+            }
+        }, error = function(e) {
+            # If there's an error, catch it and keep assess as NA
+            warning(paste("Error in get_assessments at index", i, ":", e$message))
+        })
     }
+    return(assess)
+}
 
 # First identify for which we have enough species
 target_anns <- which(c(length(interpro), length(oma)) > 1)

diff --git a/conf/base.config b/conf/base.config
@@ -76,14 +76,14 @@ process {
         time   = { check_max( 6.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_witch {
-        cpus   = { check_max( 16                  , 'cpus'    ) }
-        memory = { check_max( 32.GB * task.attempt, 'memory'  ) }
+        cpus   = { check_max( 16    * task.attempt, 'cpus'    ) }
+        memory = { check_max( 64.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 6.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_iqtree {
-        cpus   = { check_max( 6    * task.attempt                , 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt                , 'memory'  ) }
-        time   = { check_max( 3.h * Math.pow(3, task.attempt - 1), 'time'    ) }
+        cpus   = { check_max( 8    * task.attempt                , 'cpus'    ) }
+        memory = { check_max( 12.GB * task.attempt               , 'memory'  ) }
+        time   = { check_max( 9.h * Math.pow(3, task.attempt - 1), 'time'    ) }
     }
     withLabel:process_fasttree {
         cpus   = { check_max( 16    * task.attempt                , 'cpus'    ) }

diff --git a/modules/local/annotate_uniprot.nf b/modules/local/annotate_uniprot.nf
@@ -24,18 +24,25 @@ process ANNOTATE_UNIPROT {
     task.ext.when == null || task.ext.when
 
     script:
-    def args        = task.ext.args ?: ''
-    def spp         = "${meta.id}"
-    def is_uniprot  = "${meta.uniprot}"
-    def project_dir = "${projectDir}"
+    def args               = task.ext.args ?: ''
+    def spp                = "${meta.id}"
+    def is_uniprot         = "${meta.uniprot}"
+    def project_dir        = "${projectDir}"
+    def sppid_protid_delim = "${params.sppid_protid_delim}"
     """
     # Only annotate species for which protein IDs are found in UniProt (i.e.
     # proteomes come from UniProt).
     # Check below - if from uniprot, go ahead and annotate, otherwise skip the species.
     if [ "$is_uniprot" == "true" ]; then
         # Pull out the sequence names, strip trailing info, and remove spp name.
-        grep ">" $fasta | cut -d" " -f1 | cut -d":" -f2 > ${spp}_protein_accessions.txt
-
+        grep ">" $fasta | cut -d" " -f1 | cut -d"$sppid_protid_delim" -f2 > ${spp}_protein_accessions.txt
+
+        # If the data was pre-processed using our snakemake workflow, the uniprot
+        # accessions will actually be nested between to pipes ("|") - for example:
+        # "tr|UNIPROTID|UNIPROTID-SppAbbrev"
+        # Make sure that the file containing protein accessions have pulled these out
+        cut -f2 -d"|" ${spp}_protein_accessions.txt > tmp && mv tmp ${spp}_protein_accessions.txt
+
         # Now run the script to pull down annotations for the protein accessions in this species.
         # This Python script uses the bioservices python package to accomplish this.
         # NOTE: The script is packaged in the bin/ subdirectory of this workflow.

diff --git a/modules/local/asteroid.nf b/modules/local/asteroid.nf
@@ -31,27 +31,33 @@ process ASTEROID {
     // always gets set as the file itself, excluding the path
     script:
     def args = task.ext.args ?: ''
+    def sppid_protid_delim = "${params.sppid_protid_delim}"
     """
-    # Generate new protein names to more readily delimit species/protein ids
-    echo "$species_names" | sed "s/\\[//g" | sed "s/\\]//g" | tr "," "\\n" > original_spp_names.txt
-    sed "s/_/-/g" original_spp_names.txt | sed "s/ //g" > new_spp_names.txt
-    paste original_spp_names.txt new_spp_names.txt > spp_rename.txt
-    rm original_spp_names.txt new_spp_names.txt
-
     # Create the list of gene family trees to be decomposed into single-copy
     # trees using DISCO
     cat *.treefile >> gene_family_trees.newick
-
-    # Use the updated species names to update protein names in these gene family
-    # trees, ensuring that underscores in names successfully delimit protein ids
-    # Combine these, and update names in the tree file to match these:
-    while read spp
-    do
-        old=\$(echo \$spp | cut -f1 -d" ")
-        new=\$(echo \$spp | cut -f2 -d" ")
-        sed -i "s/\${old}/\${new}/g" gene_family_trees.newick
-    done < spp_rename.txt
-
+
+    # If the delimiter between species ID and protein ID is not an underscore (_), 
+    # then make sure that any underscore in the species names is replaced with a dash (-),
+    # and replace that delimiter with an underscore.
+    if [ "$sppid_protid_delim" != "_" ]; then
+        # Generate new protein names to more readily delimit species/protein ids
+        echo "$species_names" | sed "s/\\[//g" | sed "s/\\]//g" | tr "," "\\n" > original_spp_names.txt
+        sed "s/_/-/g" original_spp_names.txt | sed "s/ //g" > new_spp_names.txt
+        paste original_spp_names.txt new_spp_names.txt > spp_rename.txt
+        rm original_spp_names.txt new_spp_names.txt
+
+        # Use the updated species names to update protein names in these gene family
+        # trees, ensuring that underscores in names successfully delimit protein ids
+        # Combine these, and update names in the tree file to match these:
+        while read spp
+        do
+            old=\$(echo \$spp | cut -f1 -d" ")
+            new=\$(echo \$spp | cut -f2 -d" ")
+            sed -i "s/\${old}/\${new}/g" gene_family_trees.newick
+        done < spp_rename.txt
+    fi
+
     # Run DISCO to decompose gene family trees into single-copy trees, rooted to
     # minimize the number of duplications and losses
     python /DISCO/disco.py \\
@@ -66,14 +72,17 @@ process ASTEROID {
     -p asteroid \
     $args
 
-    # Now revert the species names back to their original values in all treefiles:
-    while read spp
-    do
-        old=\$(echo \$spp | cut -f1 -d" ")
-        new=\$(echo \$spp | cut -f2 -d" ")
-        sed -i "s/\${new}/\${old}/g" *.newick
-    done < spp_rename.txt
-
+    # Now, conditionally (based on the protein/species ID delimiter) revert the 
+    # species names back to their original values in all treefiles:
+    if [ "$sppid_protid_delim" != "_" ]; then
+        while read spp
+        do
+            old=\$(echo \$spp | cut -f1 -d" ")
+            new=\$(echo \$spp | cut -f2 -d" ")
+            sed -i "s/\${new}/\${old}/g" *.newick
+        done < spp_rename.txt
+    fi
+
     # Lastly, reroot the tree if user-specified outgroups are provided
     if [[ $outgroups != "none" ]]; then
         reroot_speciestree.R asteroid.bestTree.newick $outgroups

diff --git a/modules/local/cogeqc.nf b/modules/local/cogeqc.nf
@@ -25,12 +25,13 @@ process COGEQC {
 
     // always gets set as the file itself, excluding the path
     script:
+    def sppid_protid_delim = "${params.sppid_protid_delim}"
     """
     # Assess orthogroups inferred using each inflation parameter, summarizing
     # how well they group proteins with the same domains together, as well as
     # other summary stats like number of ogs with >= the minimum # species,
     # per-species gene count per-og, etc.
-    cogeqc_summarize_ogs.R ${orthofinder_outdir} ${min_spp}
+    cogeqc_summarize_ogs.R ${orthofinder_outdir} ${min_spp} "${sppid_protid_delim}"
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         cogeqc: \$( cat version.txt | head -n1 | sed "s/\\[1] ‘//g" | sed "s/’//g" )

diff --git a/modules/local/orthofinder_prep.nf b/modules/local/orthofinder_prep.nf
@@ -23,14 +23,29 @@ process ORTHOFINDER_PREP {
     path "versions.yml"      , emit: versions
 
     script:
+    def sppid_protid_delim = "${params.sppid_protid_delim}"
     """
-    # The fasta directory depends on whether we're running the mcl testing or not.
-    orthofinder \\
-        -f ./ \\
-        -t ${task.cpus} \\
-        -op > tmp
-
-    mkdir ${output_directory} && mv OrthoFinder/ ${output_directory}
+    # Make sure the delimiter is actually present (once) in the sequence ID
+    # Use a representative fasta file to find out
+    rep_fa=\$(ls ./*.fa* | head -n1)
+    seqid=\$(head -n1 \$rep_fa | cut -f1 -d" ")
+    in_seqid=\$(echo \$seqid | awk -F'$sppid_protid_delim' '{print NF-1}')
+
+    # If the delimiter is not actually in the seqid, stop the workflow.
+    if [ \$in_seqid == 0 ]; then
+        echo "ERROR: The species and protein ID delimiter is not present in the sequence IDs!"
+        echo "Double check that you are using the correct delimiter for your data as specified using the 'sppid_protid_delim' workflow parameter. This delimiter must only occur once, separating the species ID and protein ID" 
+        echo "You are using the the following delimiter: '$sppid_protid_delim'"
+        exit 1
+    else
+        # The fasta directory depends on whether we're running the mcl testing or not.
+        orthofinder \\
+            -f ./ \\
+            -t ${task.cpus} \\
+            -op > tmp
+
+        mkdir ${output_directory} && mv OrthoFinder/ ${output_directory}
+    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/witch.nf b/modules/local/witch.nf
@@ -32,6 +32,14 @@ process WITCH {
     def og      = "${meta.og}"
     def min_len = params.min_ungapped_length ?: '20'
     """
+    # Scale down the number of threads allocated to MAFFT based on the number of
+    # task attempts (i.e. reduce the memory usage of MAFFT if MAGUS keeps 
+    # running out of memory for this reason). Will slow down MAGUS somewhat, 
+    # but will increase the chance of the process completing successfully. 
+    mafft_threads=\$(( 9 - ${task.attempt} ))
+    sed -i "s/numthreadsit=8/numthreadsit=\$mafft_threads/g" /WITCH/*/*/*/*/*/bin/mafft 
+    sed -i "s/numthreads -lt 8/numthreads -lt \$mafft_threads/g" /WITCH/*/*/*/*/*/bin/mafft 
+
     # If we are resuming a run, do some cleanup:
     if [ -d "alignments/" ]; then
         rm -rf alignments/

diff --git a/nextflow.config b/nextflow.config
@@ -12,6 +12,9 @@ params {
     // TODO nf-core: Specify your pipeline's command line flags
     // Input options
     input                      = null
+
+    // Delimeter used to separate the species and protein ID
+    sppid_protid_delim         = '_'
 
     // OrthoFinder MCL inflation parameters to test
     mcl_inflation              = '1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -30,6 +30,11 @@
                     "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file.",
                     "fa_icon": "fas fa-file-csv"
                 },
+                "sppid_protid_delim": {
+                    "type": "string",
+                    "default": "_",
+                    "description": "Option specifying the character that delimits the unique species and protein IDs. This delimiter MUST NOT occur within either the species ID or any protein ID."
+                },
                 "mcl_inflation": {
                     "type": "string",
                     "description": "Array of MCL inflation values, expressed as a comma-separated string (ie 1.5,3.0,4.5), or a single numerical value (ie 1.5).",