diff --git a/assets/sample_fastas/Cdiphtheriae/BX248355.fasta b/assets/sample_fastas/Cdiphtheriae/BX248355.fasta index e5b99871..b336fce1 100644 --- a/assets/sample_fastas/Cdiphtheriae/BX248355.fasta +++ b/assets/sample_fastas/Cdiphtheriae/BX248355.fasta @@ -1,4 +1,4 @@ ->BX248355-1-segment2 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8 +>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8 TTATATGAGCCCTTCCTTATCGAGTCATCTGCATCAGCGGCCTCGGTATCGGCCACCTGCGGTTCACTTG GTGGCAGATCCATCGTTACTGGCTCGCCACGAACACTCACTACTGTAGTAAGTAGGGGGCTAGGCCACAT AAGTGGGAAAGGGTGAGTTATATCATATGAAAATTGCCCGAAATAGGAGTAAAGAATTTTGCCTGCGATA @@ -4072,7 +4072,7 @@ ACCTCACCTCACCTGCGAGAGGTGAGGTAGAAAACGTAACTGCGACCGATTACAGCGGGATGTTGCCGTG CTTCTTCGCAGGAACGTTAACAACCTTGCGGTCCAAGAGGCGTAGCGCTTCAATGATCTGACCACGGGTT TCGCTTGGTGGAATAACGGCGTCGACGTAACCACGCTCGGCTGCCATGTAAGGGTTGACCAAAGTTTCTT CGTACTCGCGCTCGTATTCCTTAGCTACAGCGAGCATGTCCTTGCCTTCGGCAGCAGCGGCCTTGATTTC ->BX248355-1-segment3 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8 +>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8 CTTGCGGTAGATGAATCCCACGGCACCAGCGGCGCCCATGACAGCGATCTGTGCGGTTGGCCATGCCAAG ACGAGGTCGGCGCCCATGTCCTTCGAACCCATCACGCAGTAAGCGCCACCGTAGGACTTGCGGGTAATCA CAGTGATCTTGCCGACGGTTGCTTCAGAGTAGGCGTAGAGGAGCTTGGCGCCACGGCGAATGATTCCGTC @@ -4442,7 +4442,7 @@ CGATTATTGGGCATGGTGCTCGTATTGGAGCCAATGCTCGCATTACCGGCTGTGTGATTGGTGAAGGCGC GGAAATCGGTGCACGTTGTGAGCTTCGCGACGGCATGCGTGTGTGGCCAGGAGTTGTGATTCCAACTGCT GGTATCCGTTTTAGTTCGGATGCCTAAGTTGCAGTAATACCTAGTGCAACCTAGTCATAGGCTCCCACCA CGGCAGGGTAGACCCCCTCGTTGTGGTGGGAGCTTTCGCATTTTATGGAGGCTGGCTGAAGCGATTTCTT ->BX248355-1-segment4 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8 +>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8 AGTAGCTGCTGAGTGCTAGCACGTTTGTGTTCAAACACTTTTATGCGACACGCCGATAATGAGCCGACTA GCGAAGTATTTTTAGAGGCACCACTATATGTAGTACCCCCACCGTATACCCCTATGGTGGTGGCAATGTT ACTGGTGTGACTAATGAAGTTAACAACCTGCGGAAATGCAATTTTTCCGATCGGAATTACATGTTCAGGT diff --git a/assets/sample_metadata/Cdiphtheriae_test_1.xlsx b/assets/sample_metadata/Cdiphtheriae_test_1.xlsx index edb7568f..b7867449 100644 Binary files a/assets/sample_metadata/Cdiphtheriae_test_1.xlsx and b/assets/sample_metadata/Cdiphtheriae_test_1.xlsx differ diff --git a/bin/submission.py b/bin/submission.py index 785df781..ab5b3d27 100755 --- a/bin/submission.py +++ b/bin/submission.py @@ -109,7 +109,7 @@ def args_parser(): help="Fasta file stored in submission directory", required=True) else: - print(f'fasta not required') + # fasta not required file_parser.add_argument("--fasta_file", help="Fasta file stored in submission directory", required=False) diff --git a/bin/submission_create.py b/bin/submission_create.py index ac88c6c1..6d5f6d63 100755 --- a/bin/submission_create.py +++ b/bin/submission_create.py @@ -123,8 +123,10 @@ def create_gisaid_submission(organism, database, submission_name, submission_dir # Create submission files gisaid_df.to_csv(os.path.join(submission_files_dir, "metadata.csv"), index=False, sep=",") shutil.copy(os.path.join(submission_files_dir, "metadata.csv"), os.path.join(submission_files_dir, "orig_metadata.csv")) - create_fasta(organism=organism, database="GISAID", metadata=metadata, submission_files_dir=submission_files_dir, fasta_file=fasta_file) - shutil.copy(os.path.join(submission_files_dir, "sequence.fsa"), os.path.join(submission_files_dir, "orig_sequence.fsa")) + """ + create_fasta(organism=organism, database="GISAID", submission_files_dir=submission_files_dir, fasta_file=fasta_file) + """ + shutil.copy(fasta_file, os.path.join(submission_files_dir, "orig_sequence.fsa")) print("\n"+"Creating submission files for " + database, file=sys.stdout) print("Files are stored at: " + os.path.join(submission_files_dir), file=sys.stdout) @@ -404,27 +406,37 @@ def create_authorset(config_dict, metadata, submission_name, submission_files_di f.write(" }\n") f.write("}\n") -# Create fasta file based on database -def create_fasta(organism, database, metadata, fasta_file, submission_files_dir): - # Make sure sequence name is found in fasta file header - fasta_df = submission_process.process_fasta_samples(metadata=metadata, fasta_file=fasta_file) - # Now replace fasta header with appropriate sequence ids - # Extract the required fields for specified database - db_required_colnames = submission_process.get_required_colnames(database=database, organism=organism) - # Get the sample names with "#" symbol - sample_colname = list(filter(lambda x: ("#" in x)==True, db_required_colnames))[0].replace("#","").replace("*","") - # Create fasta file - records = [] - for index, row in fasta_df.iterrows(): - records.append(SeqRecord(row["fasta_sequence_orig"], id = row[sample_colname], description = "")) - with open(os.path.join(submission_files_dir, "sequence.fsa"), "w+") as f: - SeqIO.write(records, f, "fasta") +# get locus tag from gff file for Table2asn submission +def get_gff_locus_tag(gff_file): + """ Read the locus lag from the GFF3 file for use in table2asn command""" + locus_tag = None + with open(gff_file, 'r') as file: + for line in file: + if line.startswith('##FASTA'): + break # Stop reading if FASTA section starts + elif line.startswith('#'): + continue # Skip comment lines + else: + columns = line.strip().split('\t') + if columns[2] == 'CDS': + attributes = columns[8].split(';') + for attribute in attributes: + key, value = attribute.split('=') + if key == 'locus_tag': + locus_tag = value.split('_')[0] + break # Found locus tag, stop searching + if locus_tag: + break # Found locus tag, stop searching + return locus_tag # Create a zip file for genbank submission def create_genbank_files(organism, config_dict, metadata, fasta_file, submission_name, submission_files_dir): # Create authorset file create_authorset(config_dict=config_dict, metadata=metadata, submission_name=submission_name, submission_files_dir=submission_files_dir) - create_fasta(organism=organism, database=["GENBANK"], metadata=metadata, fasta_file=fasta_file, submission_files_dir=submission_files_dir) + """ + create_fasta(organism=organism, database=["GENBANK"], fasta_file=fasta_file, submission_files_dir=submission_files_dir) + """ + shutil.copy(fasta_file, os.path.join(submission_files_dir, "sequence.fsa")) # Retrieve the source df source_df = metadata.filter(regex="^gb-seq_id$|^src-|^ncbi-spuid$|^ncbi-bioproject$|^organism$|^collection_date$").copy() source_df.columns = source_df.columns.str.replace("src-","").str.strip() @@ -469,7 +481,9 @@ def create_genbank_table2asn(submission_dir, submission_name, submission_files_d print("Downloading Table2asn.", file=sys.stdout) download_table2asn(table2asn_dir=table2asn_dir) # Command to generate table2asn submission file - command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), "-src-file", os.path.join(submission_files_dir, "source.src"), "-o", os.path.join(submission_files_dir, submission_name + ".sqn")] + command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), \ + "-src-file", os.path.join(submission_files_dir, "source.src"), "-locus-tag-prefix", get_gff_locus_tag(gff_file), \ + "-o", os.path.join(submission_files_dir, submission_name + ".sqn")] if os.path.isfile(os.path.join(submission_files_dir, "comment.cmt")): command.append("-w") command.append( os.path.join(submission_files_dir, "comment.cmt")) diff --git a/bin/submission_process.py b/bin/submission_process.py index 3e7f1c66..ea521f37 100755 --- a/bin/submission_process.py +++ b/bin/submission_process.py @@ -248,36 +248,8 @@ def check_raw_read_files(submission_name, submission_dir, metadata): return validated_files # Check sample names in metadata file are listed in fasta file -def process_fasta_samples(metadata, fasta_file): - fasta_dict = [] - # Convert fasta into df - with open(fasta_file, "r") as fsa: - records = SeqIO.parse(fsa, "fasta") - for record in records: - fasta_dict.append({"fasta_name_orig":record.id, "fasta_sequence_orig":record.seq, "fasta_description_orig":record.description}) - fasta_df = pd.DataFrame(fasta_dict) - # Remove rows if they contain all Nan - fasta_df = fasta_df.dropna(how='all') - # Check duplicates in fasta_df - duplicated_df = fasta_df[fasta_df.duplicated(subset = ["fasta_name_orig"], keep = False)] - if not duplicated_df.empty: - print("Error: Sequences in fasta file must be unique at: " + fasta_file + "\nDuplicate Sequences\n" + fasta_df["fasta_sequence_orig"].to_string(index=False), file=sys.stderr) - sys.exit(1) - # Validate duplicates don't appear on merge - try: - merged_df = metadata.merge(fasta_df, how = "outer", left_on = "sequence_name", right_on = "fasta_name_orig", validate = "1:1") - except: - print("Error: Unable to merge fasta file to metadata file. Please validate there are not duplicate sequences in both files.", file=sys.stderr) - sys.exit(1) - # Check if fasta file has sequences not in metadata - if merged_df["sequence_name"].isnull().any(): - print("Error: Sequences in fasta file do not have an associated sequence in metadata file. Please update sequences below:\n" + merged_df[merged_df["sequence_name"].isnull()]["fasta_name_orig"].to_string(), file=sys.stderr) - sys.exit(1) - # Check if metadata has sequences not in fasta file - if merged_df["fasta_name_orig"].isnull().any(): - print("Error: Sequences in metadata file do not have an associated sequence in fasta file. Please update sequences below:\n" + merged_df[merged_df["fasta_name_orig"].isnull()]["sequence_name"].to_string(), file=sys.stderr) - sys.exit(1) - return merged_df +# Removed process_fasta_samples(metadata, fasta_file), originally referenced in submission_create.create_fasta +# Only purpose of this function was to check that fasta matched metadata, and this is done in nextflow now # Update submission log def update_submission_status(submission_dir, submission_name, organism, test, send_email=False): diff --git a/conf/modules.config b/conf/modules.config index ba022c62..fa4399d6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,7 +20,7 @@ process { withName: METADATA_VALIDATION { publishDir = [ - path: { "${params.output_dir}" }, + path: { "${params.output_dir}/${params.validation_outputs}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -28,7 +28,7 @@ process { withName: BAKTA { publishDir = [ - path: { "${params.output_dir/bakta_outputs}" }, + path: { "${params.output_dir}/${params.bakta_outputs}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -36,7 +36,7 @@ process { withName: VADR_POST_CLEANUP { publishDir = [ - path: { "${params.output_dir}" }, + path: { "${params.output_dir}/${params.vadr_outputs}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test_params.config b/conf/test_params.config index ff6ce389..b950a8cc 100644 --- a/conf/test_params.config +++ b/conf/test_params.config @@ -148,18 +148,20 @@ params { download_bakta_db = false bakta_db_path = "" bakta_min_contig_length = 200 // due to compliant mode default, otherwise default is 1 + bakta_threads = 2 + bakta_gram = "?" bakta_genus = "Genus" bakta_species = "species" bakta_strain = "strain" bakta_plasmid = "unnamed" - bakta_complete = "" - bakta_compliant = "" - bakta_translation_table = 11 - bakta_gram = "?" bakta_locus = "contig" bakta_locus_tag = "LOCUSTAG123" // set a meaningful locus tag here for compliant mode - bakta_keep_contig_headers = "" + bakta_translation_table = 11 + // optional bakta params, for now need to add these to modules/nf-core/bakta/bakta/main.nf + bakta_complete = "" + bakta_compliant = "" + bakta_keep_contig_headers = "" bakta_prodigal_df = "" bakta_replicons = "" bakta_proteins = "" @@ -174,7 +176,7 @@ params { bakta_skip_sorf = "" bakta_skip_gap = "" bakta_skip_ori = "" - bakta_skip_plot = "" + bakta_skip_plot = true /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules/nf-core/bakta/bakta/main.nf b/modules/nf-core/bakta/bakta/main.nf index 0e0227a9..7a89c9a4 100644 --- a/modules/nf-core/bakta/bakta/main.nf +++ b/modules/nf-core/bakta/bakta/main.nf @@ -15,8 +15,6 @@ process BAKTA { 'https://depot.galaxyproject.org/singularity/bakta:1.9.1--pyhdfd78af_0' : 'quay.io/biocontainers/bakta:1.9.1--pyhdfd78af_0' }" - publishDir "$params.output_dir/$params.bakta_output_dir", mode: 'copy', overwrite: params.overwrite_output - input: path db_path tuple val(meta), path(fasta_path), path(fastq1), path(fastq2) @@ -38,6 +36,7 @@ process BAKTA { def skip_ori = params.bakta_skip_ori ? "--skip-ori" : "" def compliant = params.bakta_compliant ? "--compliant" : "" def complete = params.bakta_complete ? "--complete" : "" + def skip_plot = params.bakta_skip_plot ? "--skip-plot" : "" def keep_contig_headers = params.bakta_keep_contig_headers ? "--keep-contig-headers" : "" """ @@ -53,34 +52,20 @@ process BAKTA { --gram $params.bakta_gram \ --locus $params.bakta_locus \ --locus-tag $params.bakta_locus_tag \ - $complete \ - $compliant \ - $keep_contig_headers \ - $proteins \ - $prodigal_tf \ - $skip_trna \ - $skip_rrna \ - $skip_ncrna \ - $skip_ncrna_region \ - $skip_crispr \ - $skip_cds \ - $skip_sorf \ - $skip_gap \ - $skip_ori \ - $fasta_path + $complete $compliant $keep_contig_headers $proteins $prodigal_tf $skip_trna $skip_rrna \ + $skip_ncrna $skip_ncrna_region $skip_crispr $skip_cds $skip_sorf $skip_gap $skip_ori $skip_plot \ + $fasta_path """ output: - path "${fasta_path.getSimpleName()}/*.fna", emit: fna - path "${fasta_path.getSimpleName()}/*.gff3", emit: gff3 - path "${fasta_path.getSimpleName()}/*.faa", emit: faa - path "${fasta_path.getSimpleName()}/*.embl", emit: embl - path "${fasta_path.getSimpleName()}/*.ffn", emit: ffn - path "${fasta_path.getSimpleName()}/*.gbff", emit: gbff - path "${fasta_path.getSimpleName()}/*.json", emit: json - path "${fasta_path.getSimpleName()}/*.log", emit: log - path "${fasta_path.getSimpleName()}/*.png", emit: png - path "${fasta_path.getSimpleName()}/*.svg", emit: svg - path "${fasta_path.getSimpleName()}/*.tsv", emit: tsv - path "${fasta_path.getSimpleName()}/*.txt", emit: txt + path "${meta.id}/*.fna", emit: fna + path "${meta.id}/*.gff3", emit: gff3 + path "${meta.id}/*.faa", emit: faa + path "${meta.id}/*.embl", emit: embl + path "${meta.id}/*.ffn", emit: ffn + path "${meta.id}/*.gbff", emit: gbff + path "${meta.id}/*.json", emit: json + path "${meta.id}/*.log", emit: log + path "${meta.id}/*.tsv", emit: tsv + path "${meta.id}/*.txt", emit: txt } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index e69802a2..a273dd01 100644 --- a/nextflow.config +++ b/nextflow.config @@ -101,7 +101,6 @@ params { // optional bakta params bakta_complete = "" bakta_compliant = "" - bakta_complete = "" bakta_keep_contig_headers = "" bakta_prodigal_tf = "" bakta_replicons = ""