Merge pull request #69 from CDCgov/v1.2.1.-Bug-Fixes-Update

V1.2.1. Release
CDCgov · Sep 11, 2024 · fcc0410
2 parents fee1a29 + fde148c
commit fcc0410
Show file tree

Hide file tree

Showing 515 changed files with 11,468 additions and 8,804 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,3 @@
+/shiny/*
+/vignettes/*
+/docs/*
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ submit.ready
 *report.xml
 test_input/test_metadata.tsv
 upload_log.csv
+submission_log.csv
 *.vscode
 *.Rproj
 *.Rhistory
@@ -19,3 +20,7 @@ docker-compose-*.yaml
 
 # ignore folders
 **/.Rproj.user
+**/test_data/*
+**/gisaid_cli/*
+**/COV_TEST_DATA/*
+**/FLU_TEST_DATA/*
diff --git a/README.Rmd b/README.Rmd
@@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES
 
 <p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>
 
-**Beta Version**: v1.2.0. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
+**Beta Version**: v1.2.1. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
 
 **General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm).  GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
 

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@
 
 </p>
 
-**Beta Version**: 1.2.0. This pipeline is currently in Beta testing, and
+**Beta Version**: 1.2.1. This pipeline is currently in Beta testing, and
 issues could appear during submission. Please use it at your own risk.
 Feedback and suggestions are welcome\!
 

diff --git a/argument_handler.py b/argument_handler.py
@@ -72,7 +72,7 @@ def args_parser():
 		required=True)
 	file_parser.add_argument("--fasta_file",
 		help="Fasta file used to generate submission files; fasta header should match the column 'sequence_name' stored in your metadata. Input either full file path or if just file name it must be stored at '<submission_dir>/<submission_name>/<fasta_file>'.",
-		required=True)
+		default = None)
 	file_parser.add_argument("--table2asn",
 		help="Perform a table2asn submission instead of GenBank FTP submission for organism choices 'FLU' or 'COV'.",
 		required=False,

diff --git a/biosample_sra_handler.py b/biosample_sra_handler.py
@@ -58,7 +58,6 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
 		column_ordered = ["sample_name","library_ID"]
 		prefix = "sra-"
 		# Create SRA specific fields
-		metadata["sra-title"] = config_dict["Description"]["Title"]
 		filename_cols = [col for col in metadata.columns.tolist() if re.match("sra-file_[1-9]\d*", col)]
 		# Correct index for filename column
 		for col in filename_cols:
@@ -69,8 +68,8 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
 				rename_columns[col] = col.replace("sra-file_", "sra-filename")
 	elif "BIOSAMPLE" in database:
 		metadata_regex = "^bs-|^organism$|^collection_date$"
-		rename_columns = {"bs-description":"sample_title","bioproject":"bioproject_accession"}
-		drop_columns = ["bs-package"]
+		rename_columns = {"bioproject":"bioproject_accession"}
+		drop_columns = ["bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"]
 		column_ordered = ["sample_name"]
 		prefix = "bs-"
 	else:
@@ -92,14 +91,31 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
 	file_handler.save_csv(df=database_df, file_path=submission_dir, file_name="metadata.tsv", sep="\t")
 
 # Create submission XML
-def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame, failed_seqs_auto_removed: bool = True) -> bytes:
+def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame) -> bytes:
 	# Submission XML header
 	root = etree.Element("Submission")
 	description = etree.SubElement(root, "Description")
 	title = etree.SubElement(description, "Title")
-	title.text = config_dict["Description"]["Title"]
-	comment = etree.SubElement(description, "Comment")
-	comment.text = config_dict["Description"]["Comment"]
+	if "BIOSAMPLE" in database:
+		if "bs-title" in metadata and pd.notnull(metadata["bs-title"].iloc[0]) and metadata["bs-title"].iloc[0].strip() != 0:
+			title.text = metadata["bs-title"].iloc[0]
+		else:
+			title.text = submission_name + "-BS"
+		comment = etree.SubElement(description, "Comment")
+		if "bs-comment" in metadata and pd.notnull(metadata["bs-comment"].iloc[0]) and metadata["bs-comment"].iloc[0].strip() != 0:
+			comment.text = metadata["bs-comment"].iloc[0]
+		else:
+			comment.text = "BioSample Submission"
+	elif "SRA" in database:
+		if "sra-title" in metadata and pd.notnull(metadata["sra-title"].iloc[0]) and metadata["sra-title"].iloc[0].strip() != 0:
+			title.text = metadata["sra-title"].iloc[0]
+		else:
+			title.text = submission_name + "-SRA"
+		comment = etree.SubElement(description, "Comment")
+		if "sra-comment" in metadata and pd.notnull(metadata["sra-comment"].iloc[0]) and metadata["sra-comment"].iloc[0].strip() != 0:
+			comment.text = metadata["sra-comment"].iloc[0]
+		else:
+			comment.text = "SRA Submission"
 	# Description info including organization and contact info
 	organization = etree.SubElement(description, "Organization", type=config_dict["Description"]["Organization"]["Type"], role=config_dict["Description"]["Organization"]["Role"])
 	org_name = etree.SubElement(organization, "Name")
@@ -125,13 +141,18 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
 			sampleid = etree.SubElement(biosample, "SampleId")
 			spuid = etree.SubElement(sampleid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
 			spuid.text = row["bs-sample_name"]
-			descriptor = etree.SubElement(biosample, "Descriptor")
-			title = etree.SubElement(descriptor, "Title")
-			title.text = row["bs-description"]
+			if ("bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "") or ("bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != ""):
+				descriptor = etree.SubElement(biosample, "Descriptor")
+				if "bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "":
+					sample_title = etree.SubElement(descriptor, "Title")
+					sample_title.text = row["bs-sample_title"]
+				if "bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != "":
+					sample_description = etree.SubElement(descriptor, "Description")
+					sample_description.text = row["bs-sample_description"]
 			organismxml = etree.SubElement(biosample, "Organism")
 			organismname = etree.SubElement(organismxml, "OrganismName")
 			organismname.text = row["organism"]
-			if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
+			if "bioproject" in metadata and pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
 				bioproject = etree.SubElement(biosample, "BioProject")
 				primaryid = etree.SubElement(bioproject, "PrimaryId", db="BioProject")
 				primaryid.text = row["bioproject"]
@@ -140,10 +161,12 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
 			# Attributes
 			attributes = etree.SubElement(biosample, "Attributes")
 			# Remove columns with bs-prefix that are not attributes
-			biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-description"])]
+			biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"])]
 			for col in biosample_cols:
-				attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
-				attribute.text = row[col]
+				attribute_value = row[col]
+				if pd.notnull(attribute_value) and attribute_value.strip() != "":
+					attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
+					attribute.text = row[col]
 			# Add collection date to Attributes
 			attribute = etree.SubElement(attributes, "Attribute", attribute_name="collection_date")
 			attribute.text = row["collection_date"]
@@ -174,20 +197,21 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
 				datatype = etree.SubElement(file, "DataType")
 				datatype.text = "generic-data"
 			# Remove columns with sra- prefix that are not attributes
-			sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-file_location|sra-file_\d*)", col)]
+			sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-title|sra-comment|sra-file_location|sra-file_\d*)", col)]
 			for col in sra_cols:
-				attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
-				attribute.text = row[col]
+				attribute_value = row[col]
+				if pd.notnull(attribute_value) and attribute_value.strip() != "":
+					attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
+					attribute.text = row[col]
 			if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
 				attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioProject")
 				refid = etree.SubElement(attribute_ref_id, "RefId")
 				primaryid = etree.SubElement(refid, "PrimaryId")
 				primaryid.text = row["bioproject"]
-			if config_dict["Link_Sample_Between_NCBI_Databases"] and metadata.columns.str.contains("bs-sample_name").any():
-				attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
-				refid = etree.SubElement(attribute_ref_id, "RefId")
-				spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
-				spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
+			attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
+			refid = etree.SubElement(attribute_ref_id, "RefId")
+			spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
+			spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
 			identifier = etree.SubElement(addfiles, "Identifier")
 			spuid = etree.SubElement(identifier, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
 			spuid.text = row["sra-sample_name"]
@@ -209,7 +233,7 @@ def create_biosample_sra_submission(organism: str, database: str, submission_nam
 		create_raw_reads_list(submission_dir=submission_dir, raw_files_list=raw_files_list)
 	manual_df = metadata.copy()
 	create_manual_submission_files(database=database, submission_dir=submission_dir, metadata=manual_df, config_dict=config_dict)
-	xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict, failed_seqs_auto_removed=True)
+	xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict)
 	file_handler.save_xml(xml_str, submission_dir)
 
 # Read xml report and get status of the submission

diff --git a/config/biosample/Beta-lactamase_1_0.py b/config/biosample/Beta-lactamase_1_0.py
@@ -14,6 +14,26 @@
 			description="Identifier name used for BioSample. Max length is 50 characters.",
 			title="sample_name",
 		),
+		"bs-sample_title": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
+			title="sample title",
+		),
+		"bs-sample_description": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional description for sample.",
+			title="sample description",
+		),
 		"bs-strain": Column(
 			dtype="object",
 			checks=None,
@@ -124,6 +144,30 @@
 			description="The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format \"d[d.dddd] N|S d[dd.dddd] W|E\", eg, 38.98 N 77.11 W",
 			title="latitude and longitude",
 		),
+		"bs-title": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
+			title="biosample submission portal name",
+		),
+		"bs-comment": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
+			title="biosample submission portal description",
+		)
 	},
 	checks=[
 		Check(lambda df: ~(df["bs-strain"].isnull() & df["bs-isolate"].isnull()), ignore_na = False),

diff --git a/config/biosample/Human_1_0.py b/config/biosample/Human_1_0.py
@@ -14,6 +14,26 @@
 			description="Identifier name used for BioSample. Max length is 50 characters.",
 			title="sample_name",
 		),
+		"bs-sample_title": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
+			title="sample title",
+		),
+		"bs-sample_description": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional description for sample.",
+			title="sample description",
+		),
 		"bs-isolate": Column(
 			dtype="object",
 			checks=None,
@@ -220,6 +240,30 @@
 			required=False,
 			title="treatment",
 		),
+		"bs-title": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
+			title="biosample submission portal name",
+		),
+		"bs-comment": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
+			title="biosample submission portal description",
+		)
 	},
 	checks=None,
 	index=None,

diff --git a/config/biosample/Invertebrate_1_0.py b/config/biosample/Invertebrate_1_0.py
@@ -14,6 +14,26 @@
 			description="Identifier name used for BioSample. Max length is 50 characters.",
 			title="sample_name",
 		),
+		"bs-sample_title": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
+			title="sample title",
+		),
+		"bs-sample_description": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional description for sample.",
+			title="sample description",
+		),
 		"bs-isolate": Column(
 			dtype="object",
 			checks=None,
@@ -204,6 +224,30 @@
 			description="temperature of the sample at time of sampling",
 			title="temperature",
 		),
+		"bs-title": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
+			title="biosample submission portal name",
+		),
+		"bs-comment": Column(
+			dtype="object",
+			checks=[
+				Check(lambda s: s.nunique() == 1),
+			],
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
+			title="biosample submission portal description",
+		)
 	},
 	checks=[
 		Check(lambda df: ~(df["bs-isolate"].isnull() & df["bs-breed"].isnull()), ignore_na = False),