Skip to content

Commit

Permalink
Merge pull request #69 from CDCgov/v1.2.1.-Bug-Fixes-Update
Browse files Browse the repository at this point in the history
V1.2.1. Release
  • Loading branch information
dthoward96 authored Sep 11, 2024
2 parents fee1a29 + fde148c commit fcc0410
Show file tree
Hide file tree
Showing 515 changed files with 11,468 additions and 8,804 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/shiny/*
/vignettes/*
/docs/*
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ submit.ready
*report.xml
test_input/test_metadata.tsv
upload_log.csv
submission_log.csv
*.vscode
*.Rproj
*.Rhistory
Expand All @@ -19,3 +20,7 @@ docker-compose-*.yaml

# ignore folders
**/.Rproj.user
**/test_data/*
**/gisaid_cli/*
**/COV_TEST_DATA/*
**/FLU_TEST_DATA/*
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES

<p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>

**Beta Version**: v1.2.0. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!
**Beta Version**: v1.2.1. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!

**General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health-related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm). GitHub is not hosted by the CDC, but is a third-party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

</p>

**Beta Version**: 1.2.0. This pipeline is currently in Beta testing, and
**Beta Version**: 1.2.1. This pipeline is currently in Beta testing, and
issues could appear during submission. Please use it at your own risk.
Feedback and suggestions are welcome\!

Expand Down
2 changes: 1 addition & 1 deletion argument_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def args_parser():
required=True)
file_parser.add_argument("--fasta_file",
help="Fasta file used to generate submission files; fasta header should match the column 'sequence_name' stored in your metadata. Input either full file path or if just file name it must be stored at '<submission_dir>/<submission_name>/<fasta_file>'.",
required=True)
default = None)
file_parser.add_argument("--table2asn",
help="Perform a table2asn submission instead of GenBank FTP submission for organism choices 'FLU' or 'COV'.",
required=False,
Expand Down
70 changes: 47 additions & 23 deletions biosample_sra_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
column_ordered = ["sample_name","library_ID"]
prefix = "sra-"
# Create SRA specific fields
metadata["sra-title"] = config_dict["Description"]["Title"]
filename_cols = [col for col in metadata.columns.tolist() if re.match("sra-file_[1-9]\d*", col)]
# Correct index for filename column
for col in filename_cols:
Expand All @@ -69,8 +68,8 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
rename_columns[col] = col.replace("sra-file_", "sra-filename")
elif "BIOSAMPLE" in database:
metadata_regex = "^bs-|^organism$|^collection_date$"
rename_columns = {"bs-description":"sample_title","bioproject":"bioproject_accession"}
drop_columns = ["bs-package"]
rename_columns = {"bioproject":"bioproject_accession"}
drop_columns = ["bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"]
column_ordered = ["sample_name"]
prefix = "bs-"
else:
Expand All @@ -92,14 +91,31 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
file_handler.save_csv(df=database_df, file_path=submission_dir, file_name="metadata.tsv", sep="\t")

# Create submission XML
def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame, failed_seqs_auto_removed: bool = True) -> bytes:
def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame) -> bytes:
# Submission XML header
root = etree.Element("Submission")
description = etree.SubElement(root, "Description")
title = etree.SubElement(description, "Title")
title.text = config_dict["Description"]["Title"]
comment = etree.SubElement(description, "Comment")
comment.text = config_dict["Description"]["Comment"]
if "BIOSAMPLE" in database:
if "bs-title" in metadata and pd.notnull(metadata["bs-title"].iloc[0]) and metadata["bs-title"].iloc[0].strip() != 0:
title.text = metadata["bs-title"].iloc[0]
else:
title.text = submission_name + "-BS"
comment = etree.SubElement(description, "Comment")
if "bs-comment" in metadata and pd.notnull(metadata["bs-comment"].iloc[0]) and metadata["bs-comment"].iloc[0].strip() != 0:
comment.text = metadata["bs-comment"].iloc[0]
else:
comment.text = "BioSample Submission"
elif "SRA" in database:
if "sra-title" in metadata and pd.notnull(metadata["sra-title"].iloc[0]) and metadata["sra-title"].iloc[0].strip() != 0:
title.text = metadata["sra-title"].iloc[0]
else:
title.text = submission_name + "-SRA"
comment = etree.SubElement(description, "Comment")
if "sra-comment" in metadata and pd.notnull(metadata["sra-comment"].iloc[0]) and metadata["sra-comment"].iloc[0].strip() != 0:
comment.text = metadata["sra-comment"].iloc[0]
else:
comment.text = "SRA Submission"
# Description info including organization and contact info
organization = etree.SubElement(description, "Organization", type=config_dict["Description"]["Organization"]["Type"], role=config_dict["Description"]["Organization"]["Role"])
org_name = etree.SubElement(organization, "Name")
Expand All @@ -125,13 +141,18 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
sampleid = etree.SubElement(biosample, "SampleId")
spuid = etree.SubElement(sampleid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = row["bs-sample_name"]
descriptor = etree.SubElement(biosample, "Descriptor")
title = etree.SubElement(descriptor, "Title")
title.text = row["bs-description"]
if ("bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "") or ("bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != ""):
descriptor = etree.SubElement(biosample, "Descriptor")
if "bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "":
sample_title = etree.SubElement(descriptor, "Title")
sample_title.text = row["bs-sample_title"]
if "bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != "":
sample_description = etree.SubElement(descriptor, "Description")
sample_description.text = row["bs-sample_description"]
organismxml = etree.SubElement(biosample, "Organism")
organismname = etree.SubElement(organismxml, "OrganismName")
organismname.text = row["organism"]
if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
if "bioproject" in metadata and pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
bioproject = etree.SubElement(biosample, "BioProject")
primaryid = etree.SubElement(bioproject, "PrimaryId", db="BioProject")
primaryid.text = row["bioproject"]
Expand All @@ -140,10 +161,12 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
# Attributes
attributes = etree.SubElement(biosample, "Attributes")
# Remove columns with bs-prefix that are not attributes
biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-description"])]
biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"])]
for col in biosample_cols:
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
# Add collection date to Attributes
attribute = etree.SubElement(attributes, "Attribute", attribute_name="collection_date")
attribute.text = row["collection_date"]
Expand Down Expand Up @@ -174,20 +197,21 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
datatype = etree.SubElement(file, "DataType")
datatype.text = "generic-data"
# Remove columns with sra- prefix that are not attributes
sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-file_location|sra-file_\d*)", col)]
sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-title|sra-comment|sra-file_location|sra-file_\d*)", col)]
for col in sra_cols:
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioProject")
refid = etree.SubElement(attribute_ref_id, "RefId")
primaryid = etree.SubElement(refid, "PrimaryId")
primaryid.text = row["bioproject"]
if config_dict["Link_Sample_Between_NCBI_Databases"] and metadata.columns.str.contains("bs-sample_name").any():
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
refid = etree.SubElement(attribute_ref_id, "RefId")
spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
refid = etree.SubElement(attribute_ref_id, "RefId")
spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
identifier = etree.SubElement(addfiles, "Identifier")
spuid = etree.SubElement(identifier, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = row["sra-sample_name"]
Expand All @@ -209,7 +233,7 @@ def create_biosample_sra_submission(organism: str, database: str, submission_nam
create_raw_reads_list(submission_dir=submission_dir, raw_files_list=raw_files_list)
manual_df = metadata.copy()
create_manual_submission_files(database=database, submission_dir=submission_dir, metadata=manual_df, config_dict=config_dict)
xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict, failed_seqs_auto_removed=True)
xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict)
file_handler.save_xml(xml_str, submission_dir)

# Read xml report and get status of the submission
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Beta-lactamase_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-strain": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -124,6 +144,30 @@
description="The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format \"d[d.dddd] N|S d[dd.dddd] W|E\", eg, 38.98 N 77.11 W",
title="latitude and longitude",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=[
Check(lambda df: ~(df["bs-strain"].isnull() & df["bs-isolate"].isnull()), ignore_na = False),
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Human_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-isolate": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -220,6 +240,30 @@
required=False,
title="treatment",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=None,
index=None,
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Invertebrate_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-isolate": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -204,6 +224,30 @@
description="temperature of the sample at time of sampling",
title="temperature",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=[
Check(lambda df: ~(df["bs-isolate"].isnull() & df["bs-breed"].isnull()), ignore_na = False),
Expand Down
Loading

0 comments on commit fcc0410

Please sign in to comment.