Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
24fbfcb
add spp/protein id delimiter parameter
austinhpatton Sep 25, 2023
9c75943
add species/protein ID delimiter param to schema
austinhpatton Sep 25, 2023
eb5cd62
use param to split spp and prot ids
austinhpatton Sep 25, 2023
d9e898e
make var names consistent
austinhpatton Sep 27, 2023
e8a280f
conditionally rename species based on delimiter
austinhpatton Sep 27, 2023
1f431e2
conditionally rename species based on delimiter
austinhpatton Sep 27, 2023
63624bc
meoved trailing quotation
austinhpatton Sep 27, 2023
a289ee0
update to flexibly use spp/prot id delimiter
austinhpatton Sep 28, 2023
10631e1
fix typo
austinhpatton Sep 28, 2023
c98d07d
fix typo
austinhpatton Sep 28, 2023
17d0c41
move treefile creation out of if statement
austinhpatton Sep 28, 2023
4d16536
Merge branch 'main' of github.com:Arcadia-Science/phylorthology into …
austinhpatton Sep 28, 2023
922b45d
delimit spp/prot ids using user-specified delimiter
austinhpatton Oct 2, 2023
52dd39c
provide user-specified spp/prot ID delimiter param to cogeqc Rscript
austinhpatton Oct 2, 2023
bb4cb52
add check to see if the sppid/protid delimiter is in the sequence IDs…
austinhpatton Oct 2, 2023
d75d945
fix to schema file for nf-core compatibility
austinhpatton Oct 6, 2023
7abf125
Merge branch 'main' of github.com:Arcadia-Science/phylorthology into …
austinhpatton Oct 6, 2023
c85db4a
pull out UniProt accessions from snakemake headers
austinhpatton Oct 6, 2023
7e51613
skip OMA annotations without error as needed
austinhpatton Oct 6, 2023
ceca2d3
downscale mafft threaduse on retry
austinhpatton Oct 9, 2023
205075e
slight update to process specs
austinhpatton Oct 9, 2023
702e262
fixed path specification
austinhpatton Oct 9, 2023
38e263e
update iqtree process specs
austinhpatton Oct 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 36 additions & 17 deletions bin/cogeqc_summarize_ogs.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ og_dir <- args[1]
# And the minimum number of species for orthogroup phylogenetic inference
min_spp <- args[2]

sppid_protid_delim <- args[3]

# Read in the annotations.
annots <- list.files('./', pattern = "cogeqc_annotations.tsv")
spps <- gsub("_cogeqc_annotations.tsv", "", annots)
Expand All @@ -77,27 +79,37 @@ og_stat_dir <- paste0(og_dir, '/Comparative_Genomics_Statistics/')
# Go ahead and read in the orthogroups file
orthogroups <- read_orthogroups(og_file)

# Strip trailing text from species name - may not need in full implementation.
# Names are determined in orthofinder using the file name, so just include
# the species here.
orthogroups$Species <- gsub('[.].*', '', orthogroups$Species)
# Update the species and gene in this table to correspond to the
# species ID and gene ID we are using throughout.
# NOTE: If using ":" as the species/protein ID delimiter, orthofinder will
# replace this with a "_". This means we need to split on the last "_"
# given that underscores may have been used in the species ID.
if(sppid_protid_delim == ":"){
orthogroups$Species <- gsub('_[^_]*$', '', orthogroups$Gene)
orthogroups$Gene <- gsub('.*_', '', orthogroups$Gene)
}else{
# Otherwise, use the specific delimiter that the user provided
orthogroups$Species <- gsub(paste0(sppid_protid_delim, '.*'), '', orthogroups$Gene)
orthogroups$Gene <- gsub(paste0('.*', sppid_protid_delim), '', orthogroups$Gene)
}


# Get the complete list of species included here
all_species <- unique(orthogroups$Species)

# Remove any orthogroup members for which we do not have annotations
orthogroups <- orthogroups[which(orthogroups$Species %in% spps),]

# As before, make sure we pull out the uniprot annotations from between the
# pipes in the case that the data were preprocessed using our snakemake workflow
orthogroups$Gene <- gsub("^[^|]*\\|", "", orthogroups$Gene) %>% gsub("\\|.*$", "", .)

# Pull out the list of species - we are only running this script on a
# on a subset of species, so we want to be sure that we're not reading in
# annotations for species other than those we're testing inflation
# parameters with.
species <- unique(orthogroups$Species)

# and remove the Species name from the gene name - this will create issues when
# pairing with the annotations.
orthogroups$Gene <- gsub('^(?:[^_]*_)*\\s*(.*)', '\\1', orthogroups$Gene)

# Initialize
interpro <- list()
oma <- list()
Expand Down Expand Up @@ -156,16 +168,23 @@ og_assess_list <- list(

# A quick function to run the assessment in parallel, checking that there are
# enough species
get_assessments <-
function(i){
if(og_assess_list[[i]]$spp_count > 1){
assess <- assess_orthogroups(og_assess_list[[i]]$og_set, og_assess_list[[i]]$ann_set)
assess <- mean(assess$Mean_score)
}else{
assess <- NA
}
return(assess)
get_assessments <- function(i) {
assess <- NA # Initialize assess with NA
if (og_assess_list[[i]]$spp_count > 1) {
# Attempt to execute the following block of code
tryCatch({
assess_temp <- assess_orthogroups(og_assess_list[[i]]$og_set, og_assess_list[[i]]$ann_set)
# Only calculate the mean if assess_temp is not NULL
if (!is.null(assess_temp)) {
assess <- mean(assess_temp$Mean_score)
}
}, error = function(e) {
# If there's an error, catch it and keep assess as NA
warning(paste("Error in get_assessments at index", i, ":", e$message))
})
}
return(assess)
}

# First identify for which we have enough species
target_anns <- which(c(length(interpro), length(oma)) > 1)
Expand Down
10 changes: 5 additions & 5 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ process {
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_witch {
cpus = { check_max( 16 , 'cpus' ) }
memory = { check_max( 32.GB * task.attempt, 'memory' ) }
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
memory = { check_max( 64.GB * task.attempt, 'memory' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_iqtree {
cpus = { check_max( 6 * task.attempt , 'cpus' ) }
memory = { check_max( 12.GB * task.attempt , 'memory' ) }
time = { check_max( 3.h * Math.pow(3, task.attempt - 1), 'time' ) }
cpus = { check_max( 8 * task.attempt , 'cpus' ) }
memory = { check_max( 12.GB * task.attempt , 'memory' ) }
time = { check_max( 9.h * Math.pow(3, task.attempt - 1), 'time' ) }
}
withLabel:process_fasttree {
cpus = { check_max( 16 * task.attempt , 'cpus' ) }
Expand Down
19 changes: 13 additions & 6 deletions modules/local/annotate_uniprot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,25 @@ process ANNOTATE_UNIPROT {
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def spp = "${meta.id}"
def is_uniprot = "${meta.uniprot}"
def project_dir = "${projectDir}"
def args = task.ext.args ?: ''
def spp = "${meta.id}"
def is_uniprot = "${meta.uniprot}"
def project_dir = "${projectDir}"
def sppid_protid_delim = "${params.sppid_protid_delim}"
"""
# Only annotate species for which protein IDs are found in UniProt (i.e.
# proteomes come from UniProt).
# Check below - if from uniprot, go ahead and annotate, otherwise skip the species.
if [ "$is_uniprot" == "true" ]; then
# Pull out the sequence names, strip trailing info, and remove spp name.
grep ">" $fasta | cut -d" " -f1 | cut -d":" -f2 > ${spp}_protein_accessions.txt

grep ">" $fasta | cut -d" " -f1 | cut -d"$sppid_protid_delim" -f2 > ${spp}_protein_accessions.txt

# If the data was pre-processed using our snakemake workflow, the uniprot
# accessions will actually be nested between to pipes ("|") - for example:
# "tr|UNIPROTID|UNIPROTID-SppAbbrev"
# Make sure that the file containing protein accessions have pulled these out
cut -f2 -d"|" ${spp}_protein_accessions.txt > tmp && mv tmp ${spp}_protein_accessions.txt

# Now run the script to pull down annotations for the protein accessions in this species.
# This Python script uses the bioservices python package to accomplish this.
# NOTE: The script is packaged in the bin/ subdirectory of this workflow.
Expand Down
59 changes: 34 additions & 25 deletions modules/local/asteroid.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,33 @@ process ASTEROID {
// always gets set as the file itself, excluding the path
script:
def args = task.ext.args ?: ''
def sppid_protid_delim = "${params.sppid_protid_delim}"
"""
# Generate new protein names to more readily delimit species/protein ids
echo "$species_names" | sed "s/\\[//g" | sed "s/\\]//g" | tr "," "\\n" > original_spp_names.txt
sed "s/_/-/g" original_spp_names.txt | sed "s/ //g" > new_spp_names.txt
paste original_spp_names.txt new_spp_names.txt > spp_rename.txt
rm original_spp_names.txt new_spp_names.txt

# Create the list of gene family trees to be decomposed into single-copy
# trees using DISCO
cat *.treefile >> gene_family_trees.newick

# Use the updated species names to update protein names in these gene family
# trees, ensuring that underscores in names successfully delimit protein ids
# Combine these, and update names in the tree file to match these:
while read spp
do
old=\$(echo \$spp | cut -f1 -d" ")
new=\$(echo \$spp | cut -f2 -d" ")
sed -i "s/\${old}/\${new}/g" gene_family_trees.newick
done < spp_rename.txt


# If the delimiter between species ID and protein ID is not an underscore (_),
# then make sure that any underscore in the species names is replaced with a dash (-),
# and replace that delimiter with an underscore.
if [ "$sppid_protid_delim" != "_" ]; then
# Generate new protein names to more readily delimit species/protein ids
echo "$species_names" | sed "s/\\[//g" | sed "s/\\]//g" | tr "," "\\n" > original_spp_names.txt
sed "s/_/-/g" original_spp_names.txt | sed "s/ //g" > new_spp_names.txt
paste original_spp_names.txt new_spp_names.txt > spp_rename.txt
rm original_spp_names.txt new_spp_names.txt

# Use the updated species names to update protein names in these gene family
# trees, ensuring that underscores in names successfully delimit protein ids
# Combine these, and update names in the tree file to match these:
while read spp
do
old=\$(echo \$spp | cut -f1 -d" ")
new=\$(echo \$spp | cut -f2 -d" ")
sed -i "s/\${old}/\${new}/g" gene_family_trees.newick
done < spp_rename.txt
fi

# Run DISCO to decompose gene family trees into single-copy trees, rooted to
# minimize the number of duplications and losses
python /DISCO/disco.py \\
Expand All @@ -66,14 +72,17 @@ process ASTEROID {
-p asteroid \
$args

# Now revert the species names back to their original values in all treefiles:
while read spp
do
old=\$(echo \$spp | cut -f1 -d" ")
new=\$(echo \$spp | cut -f2 -d" ")
sed -i "s/\${new}/\${old}/g" *.newick
done < spp_rename.txt

# Now, conditionally (based on the protein/species ID delimiter) revert the
# species names back to their original values in all treefiles:
if [ "$sppid_protid_delim" != "_" ]; then
while read spp
do
old=\$(echo \$spp | cut -f1 -d" ")
new=\$(echo \$spp | cut -f2 -d" ")
sed -i "s/\${new}/\${old}/g" *.newick
done < spp_rename.txt
fi

# Lastly, reroot the tree if user-specified outgroups are provided
if [[ $outgroups != "none" ]]; then
reroot_speciestree.R asteroid.bestTree.newick $outgroups
Expand Down
3 changes: 2 additions & 1 deletion modules/local/cogeqc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ process COGEQC {

// always gets set as the file itself, excluding the path
script:
def sppid_protid_delim = "${params.sppid_protid_delim}"
"""
# Assess orthogroups inferred using each inflation parameter, summarizing
# how well they group proteins with the same domains together, as well as
# other summary stats like number of ogs with >= the minimum # species,
# per-species gene count per-og, etc.
cogeqc_summarize_ogs.R ${orthofinder_outdir} ${min_spp}
cogeqc_summarize_ogs.R ${orthofinder_outdir} ${min_spp} "${sppid_protid_delim}"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cogeqc: \$( cat version.txt | head -n1 | sed "s/\\[1] ‘//g" | sed "s/’//g" )
Expand Down
29 changes: 22 additions & 7 deletions modules/local/orthofinder_prep.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,29 @@ process ORTHOFINDER_PREP {
path "versions.yml" , emit: versions

script:
def sppid_protid_delim = "${params.sppid_protid_delim}"
"""
# The fasta directory depends on whether we're running the mcl testing or not.
orthofinder \\
-f ./ \\
-t ${task.cpus} \\
-op > tmp

mkdir ${output_directory} && mv OrthoFinder/ ${output_directory}
# Make sure the delimiter is actually present (once) in the sequence ID
# Use a representative fasta file to find out
rep_fa=\$(ls ./*.fa* | head -n1)
seqid=\$(head -n1 \$rep_fa | cut -f1 -d" ")
in_seqid=\$(echo \$seqid | awk -F'$sppid_protid_delim' '{print NF-1}')

# If the delimiter is not actually in the seqid, stop the workflow.
if [ \$in_seqid == 0 ]; then
echo "ERROR: The species and protein ID delimiter is not present in the sequence IDs!"
echo "Double check that you are using the correct delimiter for your data as specified using the 'sppid_protid_delim' workflow parameter. This delimiter must only occur once, separating the species ID and protein ID"
echo "You are using the the following delimiter: '$sppid_protid_delim'"
exit 1
else
# The fasta directory depends on whether we're running the mcl testing or not.
orthofinder \\
-f ./ \\
-t ${task.cpus} \\
-op > tmp

mkdir ${output_directory} && mv OrthoFinder/ ${output_directory}
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
8 changes: 8 additions & 0 deletions modules/local/witch.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ process WITCH {
def og = "${meta.og}"
def min_len = params.min_ungapped_length ?: '20'
"""
# Scale down the number of threads allocated to MAFFT based on the number of
# task attempts (i.e. reduce the memory usage of MAFFT if MAGUS keeps
# running out of memory for this reason). Will slow down MAGUS somewhat,
# but will increase the chance of the process completing successfully.
mafft_threads=\$(( 9 - ${task.attempt} ))
sed -i "s/numthreadsit=8/numthreadsit=\$mafft_threads/g" /WITCH/*/*/*/*/*/bin/mafft
sed -i "s/numthreads -lt 8/numthreads -lt \$mafft_threads/g" /WITCH/*/*/*/*/*/bin/mafft

# If we are resuming a run, do some cleanup:
if [ -d "alignments/" ]; then
rm -rf alignments/
Expand Down
3 changes: 3 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ params {
// TODO nf-core: Specify your pipeline's command line flags
// Input options
input = null

// Delimeter used to separate the species and protein ID
sppid_protid_delim = '_'

// OrthoFinder MCL inflation parameters to test
mcl_inflation = '1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0'
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file.",
"fa_icon": "fas fa-file-csv"
},
"sppid_protid_delim": {
"type": "string",
"default": "_",
"description": "Option specifying the character that delimits the unique species and protein IDs. This delimiter MUST NOT occur within either the species ID or any protein ID."

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it worth throwing an error if the delimiter is already in the string (e.g., if we find it twice in the string)?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, it could be - alternatively, would it be worth just internally rename things in some consistent way if we catch these exceptions?

},
"mcl_inflation": {
"type": "string",
"description": "Array of MCL inflation values, expressed as a comma-separated string (ie 1.5,3.0,4.5), or a single numerical value (ie 1.5).",
Expand Down