From 372b51210bcb3501b09b61d3aa23530a70e2096c Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 14:40:20 -0300 Subject: [PATCH 1/9] feat: Add feature dispersion script and module --- bin/Feature_Dispersion.py | 294 ++++++++++++++++++++++++ modules/local/featuredispersion/main.nf | 36 +++ 2 files changed, 330 insertions(+) create mode 100755 bin/Feature_Dispersion.py create mode 100644 modules/local/featuredispersion/main.nf diff --git a/bin/Feature_Dispersion.py b/bin/Feature_Dispersion.py new file mode 100755 index 0000000..10c265e --- /dev/null +++ b/bin/Feature_Dispersion.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python + +import argparse +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from ete3 import Tree + + +def round_to_sig_figs(value, sig_figs): + if value == 0: + return 0 + return round(value, sig_figs - int(np.floor(np.log10(abs(value)))) - 1) + + +def calculate_phylogenetic_diversity(tree): + sum_branch_lengths = 0.0 + for node in tree.traverse(): + sum_branch_lengths += node.dist + return sum_branch_lengths + + +def calculate_feature_counts(feature_file): + feature_df = pd.read_csv(feature_file, sep="\t", index_col=0) + feature_df.reset_index(inplace=True) + feature_df.rename(columns={"index": "genome"}, inplace=True) + + # Initialize lists to store data for the new dataframe + features = [] + total_counts = [] + genomes_list = [] + + # Iterate over each feature column + for feature_col in feature_df.columns[1:]: # Exclude 'genome' column + # Collect the genomes where this feature is present into a comma-separated list + genomes_with_feature = feature_df[feature_df[feature_col] == 1][ + "genome" + ].tolist() + genomes_str = ",".join(genomes_with_feature) + + # Calculate the total count of this feature across all genomes + total_count_feature = feature_df[feature_col].sum() + + # Append data to lists + features.append(feature_col) + total_counts.append(total_count_feature) + 
genomes_list.append(genomes_str) + + # Create a new dataframe from the collected data + features_df = pd.DataFrame( + {"feature": features, "total_count": total_counts, "genomes_list": genomes_list} + ) + + return features_df + + +def verify_genome_ids(tree, feature_file, samplesheet_file=None): + feature_df = pd.read_csv(feature_file, sep="\t", index_col=0) + feature_genomes = set(feature_df.index) + + if samplesheet_file: + samplesheet_df = pd.read_csv( + samplesheet_file, + sep="\t", + usecols=[0], + header=0, + names=["genome"], + skiprows=1, + ) + samplesheet_genomes = set(samplesheet_df["genome"]) + else: + samplesheet_genomes = set() + + tree_genomes = set(tree.get_leaf_names()) + + missing_in_tree = feature_genomes.union(samplesheet_genomes) - tree_genomes + + if missing_in_tree: + print( + f"Error: The following genome IDs are missing in the phylogenetic tree: {', '.join(missing_in_tree)}" + ) + exit(1) + + +def generate_heatmap(output_df, output_heatmap): + bins = np.arange(0, 1.1, 0.1) + max_genome_count = output_df["Genome Count"].max() + genome_bins = np.linspace( + 0, max_genome_count, 11 + ) # Generate 11 edges to create 10 bins + genome_bins_labels = [ + f"{int(genome_bins[i]) + 1}-{int(genome_bins[i + 1])}" + for i in range(len(genome_bins) - 1) + ] + + heatmap_data = pd.DataFrame(0, index=genome_bins_labels, columns=bins) + + for index, row in output_df.iterrows(): + pd_ratio = row["PD Ratio"] + genome_count = row["Genome Count"] + bin_idx = np.digitize(pd_ratio, bins) - 1 + genome_bin_idx = np.digitize(genome_count, genome_bins) - 1 + + # Ensure the indices are within the valid range + bin_idx = min(bin_idx, len(bins) - 1) + genome_bin_idx = min(genome_bin_idx, len(genome_bins_labels) - 1) + + heatmap_data.iloc[genome_bin_idx, bin_idx] += 1 + + plt.figure(figsize=(12, 8)) + sns.heatmap( + np.log1p(heatmap_data), + cmap="Reds", + cbar_kws={"label": "Number of Features (log scale)"}, + annot=heatmap_data, + fmt="g", + linewidths=0.5, + ) + 
plt.xlabel("PD Ratio Bins") + plt.ylabel("Genome Count Bins") + plt.title("Heatmap of Features by PD Ratio and Genome Count") + plt.xticks(ticks=np.arange(0.5, len(bins), 1), labels=np.round(bins, 1)) + plt.yticks( + ticks=np.arange(0.5, len(genome_bins_labels), 1), labels=genome_bins_labels + ) + plt.gca().invert_yaxis() + plt.savefig(output_heatmap) + plt.close() + + +def main( + tree_file, + feature_file, + output_base, + samplesheet_file=None, + samplesheet_columns=None, +): + ref_tree = Tree(tree_file) + + # Verify genome IDs + verify_genome_ids(ref_tree, feature_file, samplesheet_file) + + # Calculate phylogenetic diversity + total_diversity = calculate_phylogenetic_diversity(ref_tree) + + # Calculate feature counts + feature_distr = calculate_feature_counts(feature_file) + + # Read samplesheet if provided + if samplesheet_file: + samplesheet_df = pd.read_csv(samplesheet_file, sep="\t", header=0) + available_columns = set(samplesheet_df.columns) + if "genome" not in available_columns: + print("Error: 'genome' column is missing in the samplesheet.") + exit(1) + + samplesheet_data = {} + for column in samplesheet_columns: + if column in available_columns: + samplesheet_data[column] = ( + samplesheet_df[["genome", column]] + .set_index("genome")[column] + .to_dict() + ) + else: + print( + f"Warning: Column '{column}' not found in the samplesheet. Skipping." 
+ ) + else: + samplesheet_data = {} + + # Create an empty DataFrame to store the output + output_columns = [ + "Feature Name", + "Total PD", + "Projected PD", + "PD Ratio", + "Genome Count", + "PD Ratio / Genome Count", + ] + + # Add columns for each requested samplesheet column + for column in samplesheet_columns: + output_columns.append(f"{column} Distinct Values") + output_columns.append(f"PD Ratio / {column} Values") + + output_df = pd.DataFrame(columns=output_columns) + + # Iterate over each feature in feature_distr + for index, row in feature_distr.iterrows(): + feature_name = row["feature"] + genomes_list = row["genomes_list"].split(",") + genome_count = row["total_count"] + + # Only proceed if the feature is present in more than one genome + if len(genomes_list) > 1: + # Generate a list of genomes to keep (those that have the feature) + genomes_to_keep = [genome for genome in genomes_list if genome in ref_tree] + + # Create a copy of the original tree with only the relevant genomes + projected_tree = ref_tree.copy() + projected_tree.prune(genomes_to_keep) + + # Calculate phylogenetic diversity of the projected tree + projected_diversity = calculate_phylogenetic_diversity(projected_tree) + + # Calculate the ratio of projected diversity to total diversity + ratio_diversity = projected_diversity / total_diversity + + # Calculate the ratio of projected phylogenetic diversity to total count of genomes + genome_ratio_phylogenetic_diversity = ratio_diversity / genome_count + + # Prepare the row for output + output_row = [ + feature_name, + round_to_sig_figs(total_diversity, 4), + round_to_sig_figs(projected_diversity, 4), + round_to_sig_figs(ratio_diversity, 4), + round_to_sig_figs(genome_count, 4), + round_to_sig_figs(genome_ratio_phylogenetic_diversity, 4), + ] + + # Add values for each requested samplesheet column + for column in samplesheet_columns: + if column in samplesheet_data: + # Identify distinct values for the column + distinct_values = set( + 
samplesheet_data[column].get(genome, None) + for genome in genomes_list + if genome in samplesheet_data[column] + ) + distinct_values.discard(None) + V = len(distinct_values) + PD_ratio_per_V = ratio_diversity / V if V > 0 else 0 + output_row.extend([V, round_to_sig_figs(PD_ratio_per_V, 4)]) + else: + output_row.extend([None, None]) + + # Add the row to the output DataFrame + output_df.loc[index] = output_row + + output_df_sorted = output_df.sort_values(by="PD Ratio / Genome Count") + + # Save the output dataframe to a TSV file + output_df_sorted.to_csv(f"{output_base}.tsv", sep="\t", index=False) + + # Generate the heatmap + generate_heatmap(output_df_sorted, f"{output_base}.png") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Calculate feature statistics based on phylogenetic tree and genus information." + ) + parser.add_argument( + "--tree_file", type=str, required=True, help="Path to the Newick tree file" + ) + parser.add_argument( + "--feature_file", + type=str, + required=True, + help="Path to the feature presence/absence file", + ) + parser.add_argument( + "--output_base", + type=str, + required=True, + help="Base name for the output files (without extension)", + ) + parser.add_argument( + "--samplesheet_file", + type=str, + help="Path to the file mapping genome IDs to other properties", + ) + parser.add_argument( + "--samplesheet_columns", + type=str, + help="Comma-separated list of columns to process from the samplesheet", + ) + args = parser.parse_args() + + if args.samplesheet_columns: + samplesheet_columns = args.samplesheet_columns.split(",") + else: + samplesheet_columns = [] + + main( + args.tree_file, + args.feature_file, + args.output_base, + args.samplesheet_file, + samplesheet_columns, + ) diff --git a/modules/local/featuredispersion/main.nf b/modules/local/featuredispersion/main.nf new file mode 100644 index 0000000..ab35d68 --- /dev/null +++ b/modules/local/featuredispersion/main.nf @@ -0,0 +1,36 @@ +process 
FEATURE_DISPERSION { + label 'process_single' + + conda "bioconda::ete3=3.1.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://docker.io/jvfe/rspr:v1.3.7': + 'docker.io/jvfe/rspr:v1.3.7' }" + + input: + path core_tree + path feature_profile + path samplesheet + val samplesheet_columns + + output: + path "FeatureDispersion.tsv", emit: tsv + path "FeatureDispersion.png", emit: png + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + Feature_Dispersion.py \\ + --tree_file $core_tree \\ + --feature_file $feature_profile \\ + --samplesheet_file $samplesheet \\ + --samplesheet_columns $samplesheet_columns + """ + stub: + """ + touch FeatureDispersion.tsv + touch FeatureDispersion.png + """ +} From 2c7ac6fcdfe19ca2ea5f7043c68c1fb45f1fbf7d Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 15:03:45 -0300 Subject: [PATCH 2/9] fix: Add missing output base to script --- modules/local/featuredispersion/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/featuredispersion/main.nf b/modules/local/featuredispersion/main.nf index ab35d68..23b1b3d 100644 --- a/modules/local/featuredispersion/main.nf +++ b/modules/local/featuredispersion/main.nf @@ -23,6 +23,7 @@ process FEATURE_DISPERSION { def args = task.ext.args ?: '' """ Feature_Dispersion.py \\ + --output_base FeatureDispersion \\ --tree_file $core_tree \\ --feature_file $feature_profile \\ --samplesheet_file $samplesheet \\ From 427d5161ba1e0ddd97fb1bdb587a00639698c939 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 15:09:43 -0300 Subject: [PATCH 3/9] feat: Integrate feat dispersion into the pipeline --- conf/modules.config | 8 ++++++++ nextflow.config | 1 + subworkflows/local/phylo.nf | 16 ++++++++++++++++ workflows/arete.nf | 16 +++++++++++++--- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 
870b5ae..0b27497 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -458,6 +458,14 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + + withName: FEATURE_DISPERSION { + publishDir = [ + path: { "${params.outdir}/annotation/feature_dispersion" }, + mode: params.publish_dir_mode, + ] + } + // Recombination withName: VERTICALL_PAIRWISE { ext.prefix = { "cluster_${cluster}" } diff --git a/nextflow.config b/nextflow.config index a20036b..fcabd78 100755 --- a/nextflow.config +++ b/nextflow.config @@ -36,6 +36,7 @@ params { bakta_db = null annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report' feature_profile_columns = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet' + feature_dispersion_columns = null min_pident = 60 min_qcover = 0.6 skip_profile_creation = false diff --git a/subworkflows/local/phylo.nf b/subworkflows/local/phylo.nf index 06caa9c..254b36e 100755 --- a/subworkflows/local/phylo.nf +++ b/subworkflows/local/phylo.nf @@ -15,6 +15,7 @@ include { PPANGGOLIN_MSA } from '../../modules/local/ppanggolin/msa/main' include { GML2GV } from '../../modules/local/graphviz/gml2gv/main' include { GET_SOFTWARE_VERSIONS } from '../../modules/local/get_software_versions' include { CONCAT_ALIGNMENT } from '../../modules/local/concat_alignment' +include { FEATURE_DISPERSION } from '../../modules/local/featuredispersion/main' workflow PHYLOGENOMICS{ @@ -22,6 +23,7 @@ workflow PHYLOGENOMICS{ gffs use_full_alignment use_fasttree + feature_profile main: ch_software_versions = Channel.empty() @@ -117,6 +119,20 @@ workflow PHYLOGENOMICS{ ch_software_versions = ch_software_versions.mix(IQTREE.out.versions.ifEmpty(null)) } + if (feature_profile) { + if (params.feature_dispersion_columns) { + FEATURE_DISPERSION( + core_tree, + feature_profile, + file(params.input_sample_table), + params.feature_dispersion_columns + ) + } else { + FEATURE_DISPERSION(core_tree, feature_profile, [], []) + } + + } + emit: 
phylo_software = ch_software_versions all_alignments = ch_all_alignments diff --git a/workflows/arete.nf b/workflows/arete.nf index 71c4ed4..6cde691 100755 --- a/workflows/arete.nf +++ b/workflows/arete.nf @@ -222,7 +222,12 @@ workflow ARETE { ////////////////////////// PANGENOME ///////////////////////////////////// if (!params.skip_phylo) { - PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree) + PHYLOGENOMICS( + gffs, + use_full_alignment, + use_fasttree, + ANNOTATE_ASSEMBLIES.out.feature_profile + ) ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software) if (params.run_evolccm) { @@ -476,7 +481,12 @@ workflow ANNOTATION { ////////////////////////// PANGENOME ///////////////////////////////////// if (!params.skip_phylo) { - PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree) + PHYLOGENOMICS( + gffs, + use_full_alignment, + use_fasttree, + ANNOTATE_ASSEMBLIES.out.feature_profile + ) ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software) if (params.run_evolccm) { @@ -603,7 +613,7 @@ workflow PHYLO { PHYLO_INPUT_CHECK.out.genomes.set { gffs } ////////////////////////// PANGENOME ///////////////////////////////////// - PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree) + PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree, []) ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software) From a0f74f444d6f4e196575328673ffd48a6e4e5323 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 15:19:00 -0300 Subject: [PATCH 4/9] refact: Change genome to genome_id --- bin/Feature_Dispersion.py | 179 +++++++----------------- modules/local/featuredispersion/main.nf | 7 +- 2 files changed, 57 insertions(+), 129 deletions(-) diff --git a/bin/Feature_Dispersion.py b/bin/Feature_Dispersion.py index 10c265e..a394e0c 100755 --- a/bin/Feature_Dispersion.py +++ b/bin/Feature_Dispersion.py @@ -7,24 +7,21 @@ import seaborn as sns from ete3 import Tree - def round_to_sig_figs(value, sig_figs): if value 
== 0: return 0 return round(value, sig_figs - int(np.floor(np.log10(abs(value)))) - 1) - def calculate_phylogenetic_diversity(tree): sum_branch_lengths = 0.0 for node in tree.traverse(): sum_branch_lengths += node.dist return sum_branch_lengths - def calculate_feature_counts(feature_file): - feature_df = pd.read_csv(feature_file, sep="\t", index_col=0) + feature_df = pd.read_csv(feature_file, sep='\t', index_col=0) feature_df.reset_index(inplace=True) - feature_df.rename(columns={"index": "genome"}, inplace=True) + feature_df.rename(columns={'index': 'genome_id'}, inplace=True) # Initialize lists to store data for the new dataframe features = [] @@ -32,12 +29,10 @@ def calculate_feature_counts(feature_file): genomes_list = [] # Iterate over each feature column - for feature_col in feature_df.columns[1:]: # Exclude 'genome' column + for feature_col in feature_df.columns[1:]: # Exclude 'genome_id' column # Collect the genomes where this feature is present into a comma-separated list - genomes_with_feature = feature_df[feature_df[feature_col] == 1][ - "genome" - ].tolist() - genomes_str = ",".join(genomes_with_feature) + genomes_with_feature = feature_df[feature_df[feature_col] == 1]['genome_id'].tolist() + genomes_str = ','.join(genomes_with_feature) # Calculate the total count of this feature across all genomes total_count_feature = feature_df[feature_col].sum() @@ -48,27 +43,21 @@ def calculate_feature_counts(feature_file): genomes_list.append(genomes_str) # Create a new dataframe from the collected data - features_df = pd.DataFrame( - {"feature": features, "total_count": total_counts, "genomes_list": genomes_list} - ) + features_df = pd.DataFrame({ + 'feature': features, + 'total_count': total_counts, + 'genomes_list': genomes_list + }) return features_df - def verify_genome_ids(tree, feature_file, samplesheet_file=None): - feature_df = pd.read_csv(feature_file, sep="\t", index_col=0) + feature_df = pd.read_csv(feature_file, sep='\t', index_col=0) feature_genomes 
= set(feature_df.index) if samplesheet_file: - samplesheet_df = pd.read_csv( - samplesheet_file, - sep="\t", - usecols=[0], - header=0, - names=["genome"], - skiprows=1, - ) - samplesheet_genomes = set(samplesheet_df["genome"]) + samplesheet_df = pd.read_csv(samplesheet_file, sep='\t', usecols=[0], header=0, names=['genome_id'], skiprows=1) + samplesheet_genomes = set(samplesheet_df['genome_id']) else: samplesheet_genomes = set() @@ -77,28 +66,20 @@ def verify_genome_ids(tree, feature_file, samplesheet_file=None): missing_in_tree = feature_genomes.union(samplesheet_genomes) - tree_genomes if missing_in_tree: - print( - f"Error: The following genome IDs are missing in the phylogenetic tree: {', '.join(missing_in_tree)}" - ) + print(f"Error: The following genome IDs are missing in the phylogenetic tree: {', '.join(missing_in_tree)}") exit(1) - def generate_heatmap(output_df, output_heatmap): bins = np.arange(0, 1.1, 0.1) - max_genome_count = output_df["Genome Count"].max() - genome_bins = np.linspace( - 0, max_genome_count, 11 - ) # Generate 11 edges to create 10 bins - genome_bins_labels = [ - f"{int(genome_bins[i]) + 1}-{int(genome_bins[i + 1])}" - for i in range(len(genome_bins) - 1) - ] + max_genome_count = output_df['Genome Count'].max() + genome_bins = np.linspace(0, max_genome_count, 11) # Generate 11 edges to create 10 bins + genome_bins_labels = [f'{int(genome_bins[i]) + 1}-{int(genome_bins[i + 1])}' for i in range(len(genome_bins) - 1)] heatmap_data = pd.DataFrame(0, index=genome_bins_labels, columns=bins) for index, row in output_df.iterrows(): - pd_ratio = row["PD Ratio"] - genome_count = row["Genome Count"] + pd_ratio = row['PD Ratio'] + genome_count = row['Genome Count'] bin_idx = np.digitize(pd_ratio, bins) - 1 genome_bin_idx = np.digitize(genome_count, genome_bins) - 1 @@ -109,33 +90,21 @@ def generate_heatmap(output_df, output_heatmap): heatmap_data.iloc[genome_bin_idx, bin_idx] += 1 plt.figure(figsize=(12, 8)) - sns.heatmap( - 
np.log1p(heatmap_data), - cmap="Reds", - cbar_kws={"label": "Number of Features (log scale)"}, - annot=heatmap_data, - fmt="g", - linewidths=0.5, - ) - plt.xlabel("PD Ratio Bins") - plt.ylabel("Genome Count Bins") - plt.title("Heatmap of Features by PD Ratio and Genome Count") + sns.heatmap(np.log1p(heatmap_data), cmap="Reds", cbar_kws={'label': 'Number of Features (log scale)'}, annot=heatmap_data, fmt='g', linewidths=.5) + plt.xlabel('PD Ratio Bins') + plt.ylabel('Genome Count Bins') + plt.title('Heatmap of Features by PD Ratio and Genome Count') plt.xticks(ticks=np.arange(0.5, len(bins), 1), labels=np.round(bins, 1)) - plt.yticks( - ticks=np.arange(0.5, len(genome_bins_labels), 1), labels=genome_bins_labels - ) + plt.yticks(ticks=np.arange(0.5, len(genome_bins_labels), 1), labels=genome_bins_labels) plt.gca().invert_yaxis() plt.savefig(output_heatmap) plt.close() -def main( - tree_file, - feature_file, - output_base, - samplesheet_file=None, - samplesheet_columns=None, -): + + + +def main(tree_file, feature_file, output_base, samplesheet_file=None, samplesheet_columns=None): ref_tree = Tree(tree_file) # Verify genome IDs @@ -149,49 +118,40 @@ def main( # Read samplesheet if provided if samplesheet_file: - samplesheet_df = pd.read_csv(samplesheet_file, sep="\t", header=0) + samplesheet_df = pd.read_csv(samplesheet_file, sep='\t', header=0) available_columns = set(samplesheet_df.columns) - if "genome" not in available_columns: - print("Error: 'genome' column is missing in the samplesheet.") + if 'genome_id' not in available_columns: + print("Error: 'genome_id' column is missing in the samplesheet.") exit(1) samplesheet_data = {} for column in samplesheet_columns: if column in available_columns: - samplesheet_data[column] = ( - samplesheet_df[["genome", column]] - .set_index("genome")[column] - .to_dict() - ) + samplesheet_data[column] = samplesheet_df[['genome_id', column]].set_index('genome_id')[column].to_dict() else: - print( - f"Warning: Column '{column}' not 
found in the samplesheet. Skipping." - ) + print(f"Warning: Column '{column}' not found in the samplesheet. Skipping.") else: samplesheet_data = {} # Create an empty DataFrame to store the output output_columns = [ - "Feature Name", - "Total PD", - "Projected PD", - "PD Ratio", - "Genome Count", - "PD Ratio / Genome Count", + 'Feature Name', + 'Total PD', 'Projected PD', 'PD Ratio', + 'Genome Count','PD Ratio / Genome Count' ] # Add columns for each requested samplesheet column for column in samplesheet_columns: - output_columns.append(f"{column} Distinct Values") - output_columns.append(f"PD Ratio / {column} Values") + output_columns.append(f'{column} Distinct Values') + output_columns.append(f'PD Ratio / {column} Values') output_df = pd.DataFrame(columns=output_columns) # Iterate over each feature in feature_distr for index, row in feature_distr.iterrows(): - feature_name = row["feature"] - genomes_list = row["genomes_list"].split(",") - genome_count = row["total_count"] + feature_name = row['feature'] + genomes_list = row['genomes_list'].split(',') + genome_count = row['total_count'] # Only proceed if the feature is present in more than one genome if len(genomes_list) > 1: @@ -218,18 +178,14 @@ def main( round_to_sig_figs(projected_diversity, 4), round_to_sig_figs(ratio_diversity, 4), round_to_sig_figs(genome_count, 4), - round_to_sig_figs(genome_ratio_phylogenetic_diversity, 4), + round_to_sig_figs(genome_ratio_phylogenetic_diversity, 4) ] # Add values for each requested samplesheet column for column in samplesheet_columns: if column in samplesheet_data: # Identify distinct values for the column - distinct_values = set( - samplesheet_data[column].get(genome, None) - for genome in genomes_list - if genome in samplesheet_data[column] - ) + distinct_values = set(samplesheet_data[column].get(genome, None) for genome in genomes_list if genome in samplesheet_data[column]) distinct_values.discard(None) V = len(distinct_values) PD_ratio_per_V = ratio_diversity / V if V 
> 0 else 0 @@ -240,55 +196,26 @@ def main( # Add the row to the output DataFrame output_df.loc[index] = output_row - output_df_sorted = output_df.sort_values(by="PD Ratio / Genome Count") + output_df_sorted = output_df.sort_values(by='PD Ratio / Genome Count') # Save the output dataframe to a TSV file - output_df_sorted.to_csv(f"{output_base}.tsv", sep="\t", index=False) + output_df_sorted.to_csv(f"{output_base}.tsv", sep='\t', index=False) # Generate the heatmap generate_heatmap(output_df_sorted, f"{output_base}.png") - if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Calculate feature statistics based on phylogenetic tree and genus information." - ) - parser.add_argument( - "--tree_file", type=str, required=True, help="Path to the Newick tree file" - ) - parser.add_argument( - "--feature_file", - type=str, - required=True, - help="Path to the feature presence/absence file", - ) - parser.add_argument( - "--output_base", - type=str, - required=True, - help="Base name for the output files (without extension)", - ) - parser.add_argument( - "--samplesheet_file", - type=str, - help="Path to the file mapping genome IDs to other properties", - ) - parser.add_argument( - "--samplesheet_columns", - type=str, - help="Comma-separated list of columns to process from the samplesheet", - ) + parser = argparse.ArgumentParser(description='Calculate feature statistics based on phylogenetic tree and genus information.') + parser.add_argument('--tree_file', type=str, required=True, help='Path to the Newick tree file') + parser.add_argument('--feature_file', type=str, required=True, help='Path to the feature presence/absence file') + parser.add_argument('--output_base', type=str, required=True, help='Base name for the output files (without extension)') + parser.add_argument('--samplesheet_file', type=str, help='Path to the file mapping genome IDs to other properties') + parser.add_argument('--samplesheet_columns', type=str, help='Comma-separated list of 
columns to process from the samplesheet') args = parser.parse_args() if args.samplesheet_columns: - samplesheet_columns = args.samplesheet_columns.split(",") + samplesheet_columns = args.samplesheet_columns.split(',') else: samplesheet_columns = [] - main( - args.tree_file, - args.feature_file, - args.output_base, - args.samplesheet_file, - samplesheet_columns, - ) + main(args.tree_file, args.feature_file, args.output_base, args.samplesheet_file, samplesheet_columns) diff --git a/modules/local/featuredispersion/main.nf b/modules/local/featuredispersion/main.nf index 23b1b3d..f19dbe5 100644 --- a/modules/local/featuredispersion/main.nf +++ b/modules/local/featuredispersion/main.nf @@ -20,14 +20,15 @@ process FEATURE_DISPERSION { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def sheet = samplesheet ? "--samplesheet_file $samplesheet": '' + def columns = samplesheet_columns ? "--samplesheet_columns $samplesheet_columns": '' """ Feature_Dispersion.py \\ --output_base FeatureDispersion \\ --tree_file $core_tree \\ --feature_file $feature_profile \\ - --samplesheet_file $samplesheet \\ - --samplesheet_columns $samplesheet_columns + $sheet \\ + $columns """ stub: """ From d9e846dbd1ce1a6640d6af8da8f3e3fc3b037290 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 17:21:17 -0300 Subject: [PATCH 5/9] refact: Add ability to read from samplesheet --- bin/Feature_Dispersion.py | 4 ++-- nextflow_schema.json | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/bin/Feature_Dispersion.py b/bin/Feature_Dispersion.py index a394e0c..422f42d 100755 --- a/bin/Feature_Dispersion.py +++ b/bin/Feature_Dispersion.py @@ -56,7 +56,7 @@ def verify_genome_ids(tree, feature_file, samplesheet_file=None): feature_genomes = set(feature_df.index) if samplesheet_file: - samplesheet_df = pd.read_csv(samplesheet_file, sep='\t', usecols=[0], header=0, names=['genome_id'], skiprows=1) + samplesheet_df = 
pd.read_csv(samplesheet_file, usecols=[0], header=0, names=['genome_id'], skiprows=1) samplesheet_genomes = set(samplesheet_df['genome_id']) else: samplesheet_genomes = set() @@ -118,7 +118,7 @@ def main(tree_file, feature_file, output_base, samplesheet_file=None, sampleshee # Read samplesheet if provided if samplesheet_file: - samplesheet_df = pd.read_csv(samplesheet_file, sep='\t', header=0) + samplesheet_df = pd.read_csv(samplesheet_file, header=0) available_columns = set(samplesheet_df.columns) if 'genome_id' not in available_columns: print("Error: 'genome_id' column is missing in the samplesheet.") diff --git a/nextflow_schema.json b/nextflow_schema.json index fae9918..73d9df3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,6 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [], "properties": { "input_sample_table": { "type": "string", @@ -179,6 +178,11 @@ "description": "Use FastTree", "fa_icon": "fas fa-tree", "default": true + }, + "feature_dispersion_columns": { + "type": "string", + "fa_icon": "fas fa-columns", + "description": "Columns from the input samplesheet to use in the feature dispersion module" } } }, @@ -217,7 +221,7 @@ }, "accessory_similarity": { "type": "number", - "default": 99.0, + "default": 99, "fa_icon": "far fa-clone", "description": "Similarity threshold for accessory genes" } @@ -475,7 +479,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -596,4 +607,4 @@ "$ref": "#/definitions/generic_options" } ] -} +} \ No newline at end of file From 3f22a8bcf3334cbbf200108f2dd953b739939ea5 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 17:25:19 -0300 Subject: [PATCH 6/9] docs: Update param docs --- docs/params.md | 208 +++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 101 deletions(-) diff --git a/docs/params.md b/docs/params.md index 8460d97..928da88 100644 --- a/docs/params.md +++ b/docs/params.md @@ -6,155 +6,161 @@ AMR/VF LGT-focused bacterial genomics workflow Define where the pipeline should find input data and save output data. -| Parameter | Description | Type | Default | Required | Hidden | -| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | --------- | -------- | ------ | -| `input_sample_table` | Path to comma-separated file containing information about the samples in the experiment.
HelpYou will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.
| `string` | | | | -| `outdir` | Path to the output directory where the results will be saved. | `string` | ./results | | | -| `db_cache` | Directory where the databases are located | `string` | | | | -| `email` | Email address for completion summary.
HelpSet this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.
| `string` | | | | -| `multiqc_title` | MultiQC report title. Printed as page header, used for filename if not otherwise specified. | `string` | | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `input_sample_table` | Path to comma-separated file containing information about the samples in the experiment.
HelpYou will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.
| `string` | | +| `outdir` | Path to the output directory where the results will be saved. | `string` | ./results | +| `db_cache` | Directory where the databases are located | `string` | | +| `email` | Email address for completion summary.
HelpSet this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.
| `string` | | +| `multiqc_title` | MultiQC report title. Printed as page header, used for filename if not otherwise specified. | `string` | | ## Reference genome options Reference and outgroup genome fasta files required for the workflow. -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------ | ------------------------------------ | -------- | ------- | -------- | ------ | -| `reference_genome` | Path to FASTA reference genome file. | `string` | | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `reference_genome` | Path to FASTA reference genome file. | `string` | | ## QC -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------- | ------------------------------------------ | --------- | ------- | -------- | ------ | -| `run_checkm` | Run CheckM QC software | `boolean` | | | | -| `apply_filtering` | Filter assemblies on QC results | `boolean` | | | | -| `skip_kraken` | Don't run Kraken2 taxonomic classification | `boolean` | | | | -| `min_n50` | Minimum N50 for filtering | `integer` | 10000 | | | -| `min_contigs_1000_bp` | Minimum number of contigs with >1000bp | `integer` | 1 | | | -| `min_contig_length` | Minimum average contig length | `integer` | 1 | | | + + +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `run_checkm` | Run CheckM QC software | `boolean` | | +| `apply_filtering` | Filter assemblies on QC results | `boolean` | | +| `skip_kraken` | Don't run Kraken2 taxonomic classification | `boolean` | | +| `min_n50` | Minimum N50 for filtering | `integer` | 10000 | +| `min_contigs_1000_bp` | Minimum number of contigs with >1000bp | `integer` | 1 | +| `min_contig_length` | Minimum average contig length | `integer` | 1 | ## Annotation Parameters for the annotation subworkflow -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------- | 
------------------------------------------------ | --------- | -------------------------------------------------------------- | -------- | ------ | -| `annotation_tools` | Comma-separated list of annotation tools to run | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report | | | -| `bakta_db` | Path to the BAKTA database | `string` | | | | -| `use_prokka` | Use Prokka (not Bakta) for annotating assemblies | `boolean` | | | | -| `min_pident` | Minimum match identity percentage for filtering | `integer` | 60 | | | -| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 | | | -| `skip_profile_creation` | Skip annotation feature profile creation | `boolean` | | | | -| `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `annotation_tools` | Comma-separated list of annotation tools to run | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report | +| `bakta_db` | Path to the BAKTA database | `string` | | +| `use_prokka` | Use Prokka (not Bakta) for annotating assemblies | `boolean` | | +| `min_pident` | Minimum match identity percentage for filtering | `integer` | 60 | +| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 | +| `skip_profile_creation` | Skip annotation feature profile creation | `boolean` | | +| `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet | ## Phylogenomics Parameters for the phylogenomics subworkflow -| Parameter | Description | Type | Default | Required | Hidden | -| -------------------- | ---------------------------------------------- | --------- | ------- | -------- | ------ | -| `skip_phylo` | Skip Pangenomics and Phylogenomics subworkflow | `boolean` | | | | -| `use_ppanggolin` | Use ppanggolin for calculating the 
pangenome | `boolean` | | | | -| `use_full_alignment` | Use full alignment | `boolean` | | | | -| `use_fasttree` | Use FastTree | `boolean` | True | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `skip_phylo` | Skip Pangenomics and Phylogenomics subworkflow | `boolean` | | +| `use_ppanggolin` | Use ppanggolin for calculating the pangenome | `boolean` | | +| `use_full_alignment` | Use full alignment | `boolean` | | +| `use_fasttree` | Use FastTree | `boolean` | True | +| `feature_dispersion_columns` | Columns from the input samplesheet to use in the feature dispersion module | `string` | | ## PopPUNK Parameters for the lineage subworkflow -| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------- | ----------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `skip_poppunk` | Skip PopPunk | `boolean` | | | | -| `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | | | | -| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` | | | | -| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` | | | | -| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 | | | -| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99.0 | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `skip_poppunk` | Skip PopPunk | `boolean` | | +| `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | | +| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` | | +| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` | | +| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 | +| `accessory_similarity` | Similarity threshold for 
accessory genes | `number` | 99 | ## Gene Order Parameters for the Gene Order Subworkflow -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `run_gene_order` | Whether to run the Gene Order subworkflow | `boolean` | | | | -| `gene_order_percent_cutoff` | Cutoff percentage of genomes a gene should be present within to be included in extraction and subsequent analysis. Should a float between 0 and 1 (e.g., 0.25 means only genes present in a minimum of 25% of genomes are kept). | `number` | 0.25 | | | -| `gene_order_label_cols` | If using annotation files predicting features, list of space separated column names to be added to the gene names | `string` | None | | | -| `num_neighbors` | Neighborhood size to extract. Should be an even number N, such that N/2 neighbors upstream and N/2 neighbors downstream will be analyzed. | `integer` | 10 | | | -| `inflation` | Inflation hyperparameter value for Markov Clustering Algorithm. | `integer` | 2 | | | -| `epsilon` | Epsilon hyperparameter value for DBSCAN clustering. | `number` | 0.5 | | | -| `minpts` | Minpts hyperparameter value for DBSCAN clustering. | `integer` | 5 | | | -| `plot_clustering` | Create Clustering HTML Plots | `boolean` | | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `run_gene_order` | Whether to run the Gene Order subworkflow | `boolean` | | +| `input_file_path` | | `string` | /home/jvfe/dev/dalhousie/arete/test/gene-order/rgi_input.txt | +| `gene_order_percent_cutoff` | Cutoff percentage of genomes a gene should be present within to be included in extraction and subsequent analysis. 
Should be a float between 0 and 1 (e.g., 0.25 means only genes present in a minimum of 25% of genomes are kept). | `number` | 0.25 | +| `gene_order_label_cols` | If using annotation files predicting features, list of space separated column names to be added to the gene names | `string` | None | +| `num_neighbors` | Neighborhood size to extract. Should be an even number N, such that N/2 neighbors upstream and N/2 neighbors downstream will be analyzed. | `integer` | 10 | +| `inflation` | Inflation hyperparameter value for Markov Clustering Algorithm. | `integer` | 2 | +| `epsilon` | Epsilon hyperparameter value for DBSCAN clustering. | `number` | 0.5 | +| `minpts` | Minpts hyperparameter value for DBSCAN clustering. | `integer` | 5 | +| `plot_clustering` | Create Clustering HTML Plots | `boolean` | | ## Recombination Parameters for the recombination subworkflow -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------- | -------------------------------- | --------- | ------- | -------- | ------ | -| `run_recombination` | Run Recombination | `boolean` | | | | -| `run_verticall` | Run Verticall recombination tool | `boolean` | True | | | -| `run_gubbins` | Run Gubbins recombination tool | `boolean` | | | | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `run_recombination` | Run Recombination | `boolean` | | +| `run_verticall` | Run Verticall recombination tool | `boolean` | True | +| `run_gubbins` | Run Gubbins recombination tool | `boolean` | | ## Dynamics -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------- | ------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `run_evolccm` | Run the community coevolution model | `boolean` | | | | -| `run_rspr` | Run rSPR | `boolean` | | | | -| `min_rspr_distance` | Minimum rSPR distance used to define processing groups | 
`integer` | 10 | | | -| `min_branch_length` | Minimum rSPR branch length | `integer` | 0 | | | -| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 | | | -| `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 | | | -| `min_heatmap_approx_rspr` | Minimum approximate rSPR distance used to generate heatmap | `integer` | 0 | | | -| `max_heatmap_approx_rspr` | Maximum approximate rSPR distance used to generate heatmap | `integer` | -1 | | | -| `min_heatmap_exact_rspr` | Minimum exact rSPR distance used to generate heatmap | `integer` | 0 | | | -| `max_heatmap_exact_rspr` | Maximum exact rSPR distance used to generate heatmap | `integer` | -1 | | | -| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR and evolCCM entries. | `string` | | | | -| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string` | | | | -| `feature_profile` | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry. 
| `string` | | | | + + +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `run_evolccm` | Run the community coevolution model | `boolean` | | +| `run_rspr` | Run rSPR | `boolean` | | +| `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 | +| `min_branch_length` | Minimum rSPR branch length | `integer` | 0 | +| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 | +| `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 | +| `min_heatmap_approx_rspr` | Minimum approximate rSPR distance used to generate heatmap | `integer` | 0 | +| `max_heatmap_approx_rspr` | Maximum approximate rSPR distance used to generate heatmap | `integer` | -1 | +| `min_heatmap_exact_rspr` | Minimum exact rSPR distance used to generate heatmap | `integer` | 0 | +| `max_heatmap_exact_rspr` | Maximum exact rSPR distance used to generate heatmap | `integer` | -1 | +| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR and evolCCM entries. | `string` | | +| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string` | | +| `feature_profile` | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry. | `string` | | ## Institutional config options Parameters used to describe centralised config profiles. These should not be edited. 
-| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------------------------------- | -------- | ------ | -| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | | True | -| `custom_config_base` | Base directory for Institutional configs.
HelpIf you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.
| `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True | -| `hostnames` | Institutional configs hostname. | `string` | | | True | -| `config_profile_name` | Institutional config name. | `string` | | | True | -| `config_profile_description` | Institutional config description. | `string` | | | True | -| `config_profile_contact` | Institutional config contact information. | `string` | | | True | -| `config_profile_url` | Institutional config URL link. | `string` | | | True | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | +| `custom_config_base` | Base directory for Institutional configs.
HelpIf you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.
| `string` | https://raw.githubusercontent.com/nf-core/configs/master | +| `hostnames` | Institutional configs hostname. | `string` | | +| `config_profile_name` | Institutional config name. | `string` | | +| `config_profile_description` | Institutional config description. | `string` | | +| `config_profile_contact` | Institutional config contact information. | `string` | | +| `config_profile_url` | Institutional config URL link. | `string` | | ## Max job request options Set the top limit for requested resources for any single job. -| Parameter | Description | Type | Default | Required | Hidden | -| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `max_cpus` | Maximum number of CPUs that can be requested for any single job.
HelpUse to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`
| `integer` | 16 | | True | -| `max_memory` | Maximum amount of memory that can be requested for any single job.
HelpUse to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`
| `string` | 128.GB | | True | -| `max_time` | Maximum amount of time that can be requested for any single job.
HelpUse to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`
| `string` | 240.h | | True | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `max_cpus` | Maximum number of CPUs that can be requested for any single job.
HelpUse to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`
| `integer` | 16 | +| `max_memory` | Maximum amount of memory that can be requested for any single job.
HelpUse to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`
| `string` | 128.GB | +| `max_time` | Maximum amount of time that can be requested for any single job.
HelpUse to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`
| `string` | 240.h | ## Generic options Less common options for the pipeline, typically set in a config file. -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------- | ------------------------------ | -------- | ------ | -| `help` | Display help text. | `boolean` | | | True | -| `publish_dir_mode` | Method used to save pipeline results to output directory.
HelpThe Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.
| `string` | copy | | True | -| `email_on_fail` | Email address for completion summary, only when pipeline fails.
HelpAn email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.
| `string` | | | True | -| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | | True | -| `max_multiqc_email_size` | File size limit when attaching MultiQC reports to summary emails. | `string` | 25.MB | | True | -| `monochrome_logs` | Do not use coloured log outputs. | `boolean` | | | True | -| `multiqc_config` | Custom config file to supply to MultiQC. | `string` | | | True | -| `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info | | True | -| `validate_params` | Boolean whether to validate parameters against the schema at runtime | `boolean` | True | | True | -| `show_hidden_params` | Show all params when using `--help`
HelpBy default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters.
| `boolean` | | | True | -| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` | | | True | -| `singularity_pull_docker_container` | Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead.
HelpThis may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues.
| `boolean` | | | True | -| `schema_ignore_params` | | `string` | genomes,modules | | | -| `multiqc_logo` | | `string` | | | True | +| Parameter | Description | Type | Default | +|-----------|-----------|-----------|-----------| +| `help` | Display help text. | `boolean` | | +| `publish_dir_mode` | Method used to save pipeline results to output directory.
HelpThe Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.
| `string` | copy | +| `email_on_fail` | Email address for completion summary, only when pipeline fails.
HelpAn email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.
| `string` | | +| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | +| `max_multiqc_email_size` | File size limit when attaching MultiQC reports to summary emails. | `string` | 25.MB | +| `monochrome_logs` | Do not use coloured log outputs. | `boolean` | | +| `multiqc_config` | Custom config file to supply to MultiQC. | `string` | | +| `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info | +| `validate_params` | Boolean whether to validate parameters against the schema at runtime | `boolean` | True | +| `show_hidden_params` | Show all params when using `--help`
HelpBy default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters.
| `boolean` | | +| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` | | +| `singularity_pull_docker_container` | Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead.
HelpThis may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues.
| `boolean` | | +| `schema_ignore_params` | | `string` | genomes,modules | +| `multiqc_logo` | | `string` | | From 939a089ea921a35ad6409a0c69884d4d93b7c5a0 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 22 Jul 2024 17:26:59 -0300 Subject: [PATCH 7/9] refact: Change output dir --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 0b27497..cdadf67 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -461,7 +461,7 @@ process { withName: FEATURE_DISPERSION { publishDir = [ - path: { "${params.outdir}/annotation/feature_dispersion" }, + path: { "${params.outdir}/phylogenomics/feature_dispersion" }, mode: params.publish_dir_mode, ] } From 86a690299e5d48f24a92e0663e8bffa90dc14843 Mon Sep 17 00:00:00 2001 From: jvfe Date: Tue, 23 Jul 2024 09:27:38 -0300 Subject: [PATCH 8/9] fix: Improve check when there is no profile --- subworkflows/local/annotation.nf | 4 ++-- subworkflows/local/phylo.nf | 4 ++-- workflows/arete.nf | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 60777ef..c0d9d69 100755 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -326,7 +326,7 @@ workflow ANNOTATE_ASSEMBLIES { ch_multiqc_files = ch_multiqc_files.mix(DIAMOND_BLAST_ICEBERG.out.log.collect{it[1]}.ifEmpty([])) } - profile = [] + profile = Channel.empty() if (tools_to_run.contains('report')) { needed_for_report = ['vfdb', 'rgi', 'mobsuite'] if (!params.use_prokka && needed_for_report.every { it in tools_to_run }) { @@ -353,7 +353,7 @@ workflow ANNOTATE_ASSEMBLIES { ) } - profile = (params.skip_profile_creation) ? [] : CREATE_REPORT.out.profile + profile = (params.skip_profile_creation) ? 
profile : CREATE_REPORT.out.profile } emit: diff --git a/subworkflows/local/phylo.nf b/subworkflows/local/phylo.nf index 254b36e..bc80713 100755 --- a/subworkflows/local/phylo.nf +++ b/subworkflows/local/phylo.nf @@ -26,6 +26,7 @@ workflow PHYLOGENOMICS{ feature_profile main: ch_software_versions = Channel.empty() + ch_feature_profile = feature_profile.ifEmpty(false) gffs .map { meta, path -> [meta.id, path.getName()] } @@ -119,7 +120,7 @@ workflow PHYLOGENOMICS{ ch_software_versions = ch_software_versions.mix(IQTREE.out.versions.ifEmpty(null)) } - if (feature_profile) { + if (ch_feature_profile != false) { if (params.feature_dispersion_columns) { FEATURE_DISPERSION( core_tree, @@ -130,7 +131,6 @@ workflow PHYLOGENOMICS{ } else { FEATURE_DISPERSION(core_tree, feature_profile, [], []) } - } emit: diff --git a/workflows/arete.nf b/workflows/arete.nf index 6cde691..9caa94d 100755 --- a/workflows/arete.nf +++ b/workflows/arete.nf @@ -613,7 +613,7 @@ workflow PHYLO { PHYLO_INPUT_CHECK.out.genomes.set { gffs } ////////////////////////// PANGENOME ///////////////////////////////////// - PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree, []) + PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree, Channel.empty()) ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software) From 65d4eff2d8607340ade79c7ca79f8a247d463fe4 Mon Sep 17 00:00:00 2001 From: jvfe Date: Tue, 23 Jul 2024 09:33:09 -0300 Subject: [PATCH 9/9] tests: Add extra input to phylo test --- tests/subworkflows/local/phylo.nf.test | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/subworkflows/local/phylo.nf.test b/tests/subworkflows/local/phylo.nf.test index ef556f4..22108ef 100644 --- a/tests/subworkflows/local/phylo.nf.test +++ b/tests/subworkflows/local/phylo.nf.test @@ -23,6 +23,7 @@ nextflow_workflow { input[1] = false // Use fasttree input[2] = true + input[3] = Channel.empty() """ } }