Merge pull request #195 from CDCgov/dev-patch3-ick4
Write the fasta straight to the GenBank submission file instead of merging it with the metadata first
kyleoconnell authored Apr 9, 2024
2 parents 90d79ec + 9b3dc0a commit a68466d
Showing 9 changed files with 64 additions and 92 deletions.
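
In short, the previous flow rebuilt sequence.fsa by merging the fasta with the metadata sheet and re-naming each record, while this commit copies the input fasta into the submission directory as-is. A minimal sketch of the two flows follows; the paths, the metadata columns, and the helper names are illustrative rather than the repository's exact code.

import shutil
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

fasta_file = "assets/sample_fastas/Cdiphtheriae/BX248355.fasta"   # example input
submission_files_dir = "submission_outputs/BX248355/genbank"      # hypothetical output dir

# Old flow (removed in this commit): merge fasta records with the metadata
# sheet, re-id each record from the metadata, then rewrite sequence.fsa.
def write_fasta_via_metadata(metadata: pd.DataFrame) -> None:
    records = {rec.id: rec for rec in SeqIO.parse(fasta_file, "fasta")}
    renamed = [
        SeqRecord(records[row["sequence_name"]].seq, id=row["sample_name"], description="")
        for _, row in metadata.iterrows()
    ]
    SeqIO.write(renamed, f"{submission_files_dir}/sequence.fsa", "fasta")

# New flow (this commit): the fasta already carries the headers GenBank needs,
# so it is copied into the submission directory unchanged.
def write_fasta_directly() -> None:
    shutil.copy(fasta_file, f"{submission_files_dir}/sequence.fsa")
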
6 changes: 3 additions & 3 deletions assets/sample_fastas/Cdiphtheriae/BX248355.fasta
@@ -1,4 +1,4 @@
>BX248355-1-segment2 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8
TTATATGAGCCCTTCCTTATCGAGTCATCTGCATCAGCGGCCTCGGTATCGGCCACCTGCGGTTCACTTG
GTGGCAGATCCATCGTTACTGGCTCGCCACGAACACTCACTACTGTAGTAAGTAGGGGGCTAGGCCACAT
AAGTGGGAAAGGGTGAGTTATATCATATGAAAATTGCCCGAAATAGGAGTAAAGAATTTTGCCTGCGATA
@@ -4072,7 +4072,7 @@ ACCTCACCTCACCTGCGAGAGGTGAGGTAGAAAACGTAACTGCGACCGATTACAGCGGGATGTTGCCGTG
CTTCTTCGCAGGAACGTTAACAACCTTGCGGTCCAAGAGGCGTAGCGCTTCAATGATCTGACCACGGGTT
TCGCTTGGTGGAATAACGGCGTCGACGTAACCACGCTCGGCTGCCATGTAAGGGTTGACCAAAGTTTCTT
CGTACTCGCGCTCGTATTCCTTAGCTACAGCGAGCATGTCCTTGCCTTCGGCAGCAGCGGCCTTGATTTC
>BX248355-1-segment3 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8
CTTGCGGTAGATGAATCCCACGGCACCAGCGGCGCCCATGACAGCGATCTGTGCGGTTGGCCATGCCAAG
ACGAGGTCGGCGCCCATGTCCTTCGAACCCATCACGCAGTAAGCGCCACCGTAGGACTTGCGGGTAATCA
CAGTGATCTTGCCGACGGTTGCTTCAGAGTAGGCGTAGAGGAGCTTGGCGCCACGGCGAATGATTCCGTC
@@ -4442,7 +4442,7 @@ CGATTATTGGGCATGGTGCTCGTATTGGAGCCAATGCTCGCATTACCGGCTGTGTGATTGGTGAAGGCGC
GGAAATCGGTGCACGTTGTGAGCTTCGCGACGGCATGCGTGTGTGGCCAGGAGTTGTGATTCCAACTGCT
GGTATCCGTTTTAGTTCGGATGCCTAAGTTGCAGTAATACCTAGTGCAACCTAGTCATAGGCTCCCACCA
CGGCAGGGTAGACCCCCTCGTTGTGGTGGGAGCTTTCGCATTTTATGGAGGCTGGCTGAAGCGATTTCTT
>BX248355-1-segment4 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8
AGTAGCTGCTGAGTGCTAGCACGTTTGTGTTCAAACACTTTTATGCGACACGCCGATAATGAGCCGACTA
GCGAAGTATTTTTAGAGGCACCACTATATGTAGTACCCCCACCGTATACCCCTATGGTGGTGGCAATGTT
ACTGGTGTGACTAATGAAGTTAACAACCTGCGGAAATGCAATTTTTCCGATCGGAATTACATGTTCAGGT
Binary file modified assets/sample_metadata/Cdiphtheriae_test_1.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion bin/submission.py
@@ -109,7 +109,7 @@ def args_parser():
help="Fasta file stored in submission directory",
required=True)
else:
print(f'fasta not required')
# fasta not required
file_parser.add_argument("--fasta_file",
help="Fasta file stored in submission directory",
required=False)
52 changes: 33 additions & 19 deletions bin/submission_create.py
@@ -123,8 +123,10 @@ def create_gisaid_submission(organism, database, submission_name, submission_dir
# Create submission files
gisaid_df.to_csv(os.path.join(submission_files_dir, "metadata.csv"), index=False, sep=",")
shutil.copy(os.path.join(submission_files_dir, "metadata.csv"), os.path.join(submission_files_dir, "orig_metadata.csv"))
create_fasta(organism=organism, database="GISAID", metadata=metadata, submission_files_dir=submission_files_dir, fasta_file=fasta_file)
shutil.copy(os.path.join(submission_files_dir, "sequence.fsa"), os.path.join(submission_files_dir, "orig_sequence.fsa"))
"""
create_fasta(organism=organism, database="GISAID", submission_files_dir=submission_files_dir, fasta_file=fasta_file)
"""
shutil.copy(fasta_file, os.path.join(submission_files_dir, "orig_sequence.fsa"))
print("\n"+"Creating submission files for " + database, file=sys.stdout)
print("Files are stored at: " + os.path.join(submission_files_dir), file=sys.stdout)

@@ -404,27 +406,37 @@ def create_authorset(config_dict, metadata, submission_name, submission_files_di
f.write(" }\n")
f.write("}\n")

# Create fasta file based on database
def create_fasta(organism, database, metadata, fasta_file, submission_files_dir):
# Make sure sequence name is found in fasta file header
fasta_df = submission_process.process_fasta_samples(metadata=metadata, fasta_file=fasta_file)
# Now replace fasta header with appropriate sequence ids
# Extract the required fields for specified database
db_required_colnames = submission_process.get_required_colnames(database=database, organism=organism)
# Get the sample names with "#" symbol
sample_colname = list(filter(lambda x: ("#" in x)==True, db_required_colnames))[0].replace("#","").replace("*","")
# Create fasta file
records = []
for index, row in fasta_df.iterrows():
records.append(SeqRecord(row["fasta_sequence_orig"], id = row[sample_colname], description = ""))
with open(os.path.join(submission_files_dir, "sequence.fsa"), "w+") as f:
SeqIO.write(records, f, "fasta")
# get locus tag from gff file for Table2asn submission
def get_gff_locus_tag(gff_file):
""" Read the locus lag from the GFF3 file for use in table2asn command"""
locus_tag = None
with open(gff_file, 'r') as file:
for line in file:
if line.startswith('##FASTA'):
break # Stop reading if FASTA section starts
elif line.startswith('#'):
continue # Skip comment lines
else:
columns = line.strip().split('\t')
if columns[2] == 'CDS':
attributes = columns[8].split(';')
for attribute in attributes:
key, value = attribute.split('=')
if key == 'locus_tag':
locus_tag = value.split('_')[0]
break # Found locus tag, stop searching
if locus_tag:
break # Found locus tag, stop searching
return locus_tag

# Create a zip file for genbank submission
def create_genbank_files(organism, config_dict, metadata, fasta_file, submission_name, submission_files_dir):
# Create authorset file
create_authorset(config_dict=config_dict, metadata=metadata, submission_name=submission_name, submission_files_dir=submission_files_dir)
create_fasta(organism=organism, database=["GENBANK"], metadata=metadata, fasta_file=fasta_file, submission_files_dir=submission_files_dir)
"""
create_fasta(organism=organism, database=["GENBANK"], fasta_file=fasta_file, submission_files_dir=submission_files_dir)
"""
shutil.copy(fasta_file, os.path.join(submission_files_dir, "sequence.fsa"))
# Retrieve the source df
source_df = metadata.filter(regex="^gb-seq_id$|^src-|^ncbi-spuid$|^ncbi-bioproject$|^organism$|^collection_date$").copy()
source_df.columns = source_df.columns.str.replace("src-","").str.strip()
Expand Down Expand Up @@ -469,7 +481,9 @@ def create_genbank_table2asn(submission_dir, submission_name, submission_files_d
print("Downloading Table2asn.", file=sys.stdout)
download_table2asn(table2asn_dir=table2asn_dir)
# Command to generate table2asn submission file
command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), "-src-file", os.path.join(submission_files_dir, "source.src"), "-o", os.path.join(submission_files_dir, submission_name + ".sqn")]
command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), \
"-src-file", os.path.join(submission_files_dir, "source.src"), "-locus-tag-prefix", get_gff_locus_tag(gff_file), \
"-o", os.path.join(submission_files_dir, submission_name + ".sqn")]
if os.path.isfile(os.path.join(submission_files_dir, "comment.cmt")):
command.append("-w")
command.append( os.path.join(submission_files_dir, "comment.cmt"))
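
The new -locus-tag-prefix argument is supplied by get_gff_locus_tag() above, which keeps the text before the first underscore of the first CDS locus_tag it finds. Below is a self-contained sanity check of that parsing logic on a throwaway GFF3 snippet; the records are made up for illustration only.

import tempfile

gff_text = (
    "##gff-version 3\n"
    "contig_1\tBakta\tregion\t1\t5000\t.\t+\t.\tID=contig_1\n"
    "contig_1\tBakta\tCDS\t10\t400\t.\t+\t0\tID=abc_00005;locus_tag=LOCUSTAG123_00005\n"
)

with tempfile.NamedTemporaryFile("w", suffix=".gff3", delete=False) as tmp:
    tmp.write(gff_text)
    gff_path = tmp.name

# Mirrors the parsing in get_gff_locus_tag: skip comment lines, find the first
# CDS row, split its attribute column on ';', and keep the locus_tag text
# before the first underscore.
locus_tag_prefix = None
with open(gff_path) as fh:
    for line in fh:
        if line.startswith("#"):
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) > 8 and columns[2] == "CDS":
            for attribute in columns[8].split(";"):
                key, _, value = attribute.partition("=")
                if key == "locus_tag":
                    locus_tag_prefix = value.split("_")[0]
                    break
        if locus_tag_prefix:
            break

print(locus_tag_prefix)  # expected: LOCUSTAG123
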
32 changes: 2 additions & 30 deletions bin/submission_process.py
@@ -248,36 +248,8 @@ def check_raw_read_files(submission_name, submission_dir, metadata):
return validated_files

# Check sample names in metadata file are listed in fasta file
def process_fasta_samples(metadata, fasta_file):
fasta_dict = []
# Convert fasta into df
with open(fasta_file, "r") as fsa:
records = SeqIO.parse(fsa, "fasta")
for record in records:
fasta_dict.append({"fasta_name_orig":record.id, "fasta_sequence_orig":record.seq, "fasta_description_orig":record.description})
fasta_df = pd.DataFrame(fasta_dict)
# Remove rows if they contain all Nan
fasta_df = fasta_df.dropna(how='all')
# Check duplicates in fasta_df
duplicated_df = fasta_df[fasta_df.duplicated(subset = ["fasta_name_orig"], keep = False)]
if not duplicated_df.empty:
print("Error: Sequences in fasta file must be unique at: " + fasta_file + "\nDuplicate Sequences\n" + fasta_df["fasta_sequence_orig"].to_string(index=False), file=sys.stderr)
sys.exit(1)
# Validate duplicates don't appear on merge
try:
merged_df = metadata.merge(fasta_df, how = "outer", left_on = "sequence_name", right_on = "fasta_name_orig", validate = "1:1")
except:
print("Error: Unable to merge fasta file to metadata file. Please validate there are not duplicate sequences in both files.", file=sys.stderr)
sys.exit(1)
# Check if fasta file has sequences not in metadata
if merged_df["sequence_name"].isnull().any():
print("Error: Sequences in fasta file do not have an associated sequence in metadata file. Please update sequences below:\n" + merged_df[merged_df["sequence_name"].isnull()]["fasta_name_orig"].to_string(), file=sys.stderr)
sys.exit(1)
# Check if metadata has sequences not in fasta file
if merged_df["fasta_name_orig"].isnull().any():
print("Error: Sequences in metadata file do not have an associated sequence in fasta file. Please update sequences below:\n" + merged_df[merged_df["fasta_name_orig"].isnull()]["sequence_name"].to_string(), file=sys.stderr)
sys.exit(1)
return merged_df
# Removed process_fasta_samples(metadata, fasta_file), originally referenced in submission_create.create_fasta
# Only purpose of this function was to check that fasta matched metadata, and this is done in nextflow now

# Update submission log
def update_submission_status(submission_dir, submission_name, organism, test, send_email=False):
6 changes: 3 additions & 3 deletions conf/modules.config
@@ -20,23 +20,23 @@ process {

withName: METADATA_VALIDATION {
publishDir = [
path: { "${params.output_dir}" },
path: { "${params.output_dir}/${params.validation_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: BAKTA {
publishDir = [
path: { "${params.output_dir/bakta_outputs}" },
path: { "${params.output_dir}/${params.bakta_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: VADR_POST_CLEANUP {
publishDir = [
path: { "${params.output_dir}" },
path: { "${params.output_dir}/${params.vadr_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
14 changes: 8 additions & 6 deletions conf/test_params.config
@@ -148,18 +148,20 @@ params {
download_bakta_db = false
bakta_db_path = ""
bakta_min_contig_length = 200 // due to compliant mode default, otherwise default is 1
bakta_threads = 2
bakta_gram = "?"
bakta_genus = "Genus"
bakta_species = "species"
bakta_strain = "strain"
bakta_plasmid = "unnamed"
bakta_complete = ""
bakta_compliant = ""
bakta_translation_table = 11
bakta_gram = "?"
bakta_locus = "contig"
bakta_locus_tag = "LOCUSTAG123" // set a meaningful locus tag here for compliant mode
bakta_keep_contig_headers = ""
bakta_translation_table = 11

// optional bakta params, for now need to add these to modules/nf-core/bakta/bakta/main.nf
bakta_complete = ""
bakta_compliant = ""
bakta_keep_contig_headers = ""
bakta_prodigal_df = ""
bakta_replicons = ""
bakta_proteins = ""
@@ -174,7 +176,7 @@ params {
bakta_skip_sorf = ""
bakta_skip_gap = ""
bakta_skip_ori = ""
bakta_skip_plot = ""
bakta_skip_plot = true

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43 changes: 14 additions & 29 deletions modules/nf-core/bakta/bakta/main.nf
@@ -15,8 +15,6 @@
'https://depot.galaxyproject.org/singularity/bakta:1.9.1--pyhdfd78af_0' :
'quay.io/biocontainers/bakta:1.9.1--pyhdfd78af_0' }"

publishDir "$params.output_dir/$params.bakta_output_dir", mode: 'copy', overwrite: params.overwrite_output

input:
path db_path
tuple val(meta), path(fasta_path), path(fastq1), path(fastq2)
@@ -38,6 +36,7 @@
def skip_ori = params.bakta_skip_ori ? "--skip-ori" : ""
def compliant = params.bakta_compliant ? "--compliant" : ""
def complete = params.bakta_complete ? "--complete" : ""
def skip_plot = params.bakta_skip_plot ? "--skip-plot" : ""
def keep_contig_headers = params.bakta_keep_contig_headers ? "--keep-contig-headers" : ""

"""
@@ -53,34 +52,20 @@
--gram $params.bakta_gram \
--locus $params.bakta_locus \
--locus-tag $params.bakta_locus_tag \
$complete \
$compliant \
$keep_contig_headers \
$proteins \
$prodigal_tf \
$skip_trna \
$skip_rrna \
$skip_ncrna \
$skip_ncrna_region \
$skip_crispr \
$skip_cds \
$skip_sorf \
$skip_gap \
$skip_ori \
$fasta_path
$complete $compliant $keep_contig_headers $proteins $prodigal_tf $skip_trna $skip_rrna \
$skip_ncrna $skip_ncrna_region $skip_crispr $skip_cds $skip_sorf $skip_gap $skip_ori $skip_plot \
$fasta_path
"""

output:
path "${fasta_path.getSimpleName()}/*.fna", emit: fna
path "${fasta_path.getSimpleName()}/*.gff3", emit: gff3
path "${fasta_path.getSimpleName()}/*.faa", emit: faa
path "${fasta_path.getSimpleName()}/*.embl", emit: embl
path "${fasta_path.getSimpleName()}/*.ffn", emit: ffn
path "${fasta_path.getSimpleName()}/*.gbff", emit: gbff
path "${fasta_path.getSimpleName()}/*.json", emit: json
path "${fasta_path.getSimpleName()}/*.log", emit: log
path "${fasta_path.getSimpleName()}/*.png", emit: png
path "${fasta_path.getSimpleName()}/*.svg", emit: svg
path "${fasta_path.getSimpleName()}/*.tsv", emit: tsv
path "${fasta_path.getSimpleName()}/*.txt", emit: txt
path "${meta.id}/*.fna", emit: fna
path "${meta.id}/*.gff3", emit: gff3
path "${meta.id}/*.faa", emit: faa
path "${meta.id}/*.embl", emit: embl
path "${meta.id}/*.ffn", emit: ffn
path "${meta.id}/*.gbff", emit: gbff
path "${meta.id}/*.json", emit: json
path "${meta.id}/*.log", emit: log
path "${meta.id}/*.tsv", emit: tsv
path "${meta.id}/*.txt", emit: txt
}
1 change: 0 additions & 1 deletion nextflow.config
@@ -101,7 +101,6 @@ params {
// optional bakta params
bakta_complete = ""
bakta_compliant = ""
bakta_complete = ""
bakta_keep_contig_headers = ""
bakta_prodigal_tf = ""
bakta_replicons = ""
