Skip to content

Commit

Permalink
Merge pull request #249 from CDCgov/ick4-patch-enable-metadata-val-flags
Browse files Browse the repository at this point in the history
fully integrate the keep_demographic_info and date_format flags
  • Loading branch information
RamiyapriyaS authored Jan 21, 2025
2 parents 481a9f7 + 3ab77d3 commit a614b35
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 96 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Refer to the wiki for more information on input parameters and use cases
### 7. Custom metadata validation and custom BioSample package

TOSTADAS defaults to Pathogen.cl.1.0 (Pathogen: clinical or host-associated; version 1.0) NCBI BioSample package for submissions to the BioSample repository. You can submit using a different BioSample package by doing the following:
1. Change the package name in the `conf/submission_config.yamlsubmissions`. Choose one of the available [NCBI BioSample packages](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/).
1. Change the package name in the `conf/submission_config.yaml`. Choose one of the available [NCBI BioSample packages](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/).
2. Add the necessary fields for your BioSample package to your input Excel file.
3. Add those fields as keys to the JSON file (`assets/custom_meta_fields/example_custom_fields.json`) and provide key info as needed.
`replace_empty_with`: TOSTADAS will replace any empty cells with this value. (Example application: NCBI expects some value for every mandatory field, so if a field is empty you may want to change it to "Not Provided".)
Expand Down
213 changes: 126 additions & 87 deletions bin/validate_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def metadata_validation_main():
parameters_class.get_parameters()
parameters = parameters_class.parameters
try:
assert len([x for x in parameters.keys() if x in ['fasta_path', 'meta_path', 'output_dir', 'condaEnv', 'keep_personal_info',
assert len([x for x in parameters.keys() if x in ['fasta_path', 'meta_path', 'output_dir', 'condaEnv', 'remove_demographic_info',
'date_format_flag', 'file_name', 'restricted_terms',
'illumina_instrument_restrictions', 'nanopore_instrument_restrictions',
'fasta_names', 'overwrite_output_files',
Expand Down Expand Up @@ -122,14 +122,14 @@ def get_args():
parser.add_argument("-o", "--output_dir", type=str, default='validation_outputs',
help="Output Directory for final Files, default is current directory")
parser.add_argument("--overwrite_output_files", type=bool, default=True, help='whether to overwrite the output dir')
parser.add_argument("-k", "--keep_personal_info", action="store_true", default=False,
help="Flag to keep personal identifying info if provided otherwise it will return an " +
"error if personal information is provided.")
parser.add_argument("-k", "--remove_demographic_info", action="store_true", default=False,
help="Flag to remove potentially identifying demographic info if provided otherwise no change will be made " +
"Applies to host_sex, host_age, race, ethnicity.")
parser.add_argument("-d", "--date_format_flag", type=str, default="s", choices=['s', 'o', 'v'],
help="Flag to differ date output, s = default (YYYY-MM), " +
"o = original(this skips date validation), v = verbose(YYYY-MM-DD)")
parser.add_argument("--custom_fields_file", type=str, help="File containing custom fields, datatypes, and which samples to check")
parser.add_argument("--validate_custom_fields", type=bool, help="Flag for whether or not validate custom fields ")
parser.add_argument("--validate_custom_fields", type=bool, default=True, help="Flag for whether or not validate custom fields ")
return parser

def get_restrictions(self):
Expand Down Expand Up @@ -248,7 +248,6 @@ def __init__(self, filled_df, parameters):

# global variables for keeping track of sample properties
self.did_validation_work = True
self.case_data_detected = False
self.valid_sample_num = 0
self.list_of_sample_errors = []
self.list_of_sample_dfs = {}
Expand Down Expand Up @@ -318,19 +317,16 @@ def validate_main(self):
self.metadata_df.loc[self.metadata_df['sample_name'] == name, 'author'] = fixed_authors
except:
self.author_valid = False
self.sample_error_msg = "\n\t Invalid Author Name, please list as full names seperated by ;"
self.sample_error_msg = "\n\t Invalid Author Name, please list as full names separated by ;"

# run the check on the PI meta information is the keep_personal_info flag is true
# run the check on the PI meta information if the remove_demographic_info flag is true
try:
assert self.case_data_detected is False
assert self.meta_case_grade is True
except AssertionError:
raise AssertionError(f'Either case_data_detected is not False or meta_case_grade is not True default values')
if self.parameters['keep_personal_info'] is True:
self.check_meta_case(sample_info)
if self.meta_case_grade is False:
# just for tracking globally that there was an irregularity with empty personal information
self.case_data_detected = True
raise AssertionError(f'meta_case_grade was not reset to True')
if self.parameters['remove_demographic_info'] is True:
self.sample_error_msg += (f"\n\t\t'remove_demographic_info' flag is True. Sample demographic data will be removed if present.")
self.metadata_df = self.check_meta_case(sample_info)

# check if the SRA submission is triggered, if it is, then run the class of functions for handling sra submission
if str(sample_info["ncbi_sequence_name_sra"]) != "" or str(sample_info["ncbi_sequence_name_sra"]) != '':
Expand Down Expand Up @@ -402,7 +398,7 @@ def validate_main(self):
self.did_validation_work = errors_class.capture_final_error (
final_error_file = self.final_error_file, repeat_error = self.repeat_error,
matchup_error = self.matchup_error, valid_date_flag = self.valid_date_flag,
date_error_msg = self.date_error_msg, case_data_detected = self.case_data_detected, valid_sample_num = self.valid_sample_num,
date_error_msg = self.date_error_msg, valid_sample_num = self.valid_sample_num,
metadata_df = self.metadata_df, list_of_sample_errors = self.list_of_sample_errors, repeated = self.repeated,
did_validation_work = self.did_validation_work,
)
Expand Down Expand Up @@ -455,73 +451,115 @@ def check_for_repeats_in_meta(self):

def check_date(self):
    """
    Validate and reformat each sample's collection_date according to date_format_flag.

    Accepted input formats (separators '-' or '/', optional trailing HH:MM:SS):
    YYYY, YYYY-MM, YYYY-MM-DD, M/D/YYYY, M/YYYY. Two-digit years (M/D/YY) are
    rejected as ambiguous.

    Flags:
        's' (short)    -> output YYYY-MM (missing month becomes "00")
        'v' (verbose)  -> output YYYY-MM-DD (missing parts become "00")
        'o' (original) -> leave the value unchanged and skip validation

    Missing or invalid dates are recorded in date_errors.txt and a ValueError
    is raised so the overall validation fails loudly.
    """
    # valid_date_flag must still hold its default before this check runs
    try:
        assert self.valid_date_flag is True
    except AssertionError:
        raise AssertionError("Valid date flag is not properly set to the default value of True")

    dates_list = self.metadata_df["collection_date"].tolist()
    samples_list = self.metadata_df["sample_name"].tolist()
    dates_holder = {'missing_dates': [], 'invalid_dates': []}

    # Each entry pairs a regex with the ordering of its captured date groups,
    # so year/month/day are mapped correctly regardless of which form matched.
    date_patterns = [
        # YYYY, YYYY-MM, YYYY-MM-DD
        (r"^(\d{4})(?:[-/](\d{1,2}))?(?:[-/](\d{1,2}))?(?:\s(\d{2}):(\d{2}):(\d{2}))?$", ('y', 'm', 'd')),
        # M/D/YYYY, MM/DD/YYYY
        (r"^(\d{1,2})[-/](\d{1,2})[-/](\d{4})(?:\s(\d{2}):(\d{2}):(\d{2}))?$", ('m', 'd', 'y')),
        # M/YYYY, MM/YYYY
        (r"^(\d{1,2})[-/](\d{4})(?:\s(\d{2}):(\d{2}):(\d{2}))?$", ('m', 'y')),
        # M/D/YY -- matched only so the two-digit year is rejected explicitly below
        (r"^(\d{1,2})[-/](\d{1,2})[-/](\d{2})(?:\s(\d{2}):(\d{2}):(\d{2}))?$", ('m', 'd', 'y')),
    ]

    flag = str(self.parameters['date_format_flag']).lower()

    for i, date_value in enumerate(dates_list):
        sample_name = samples_list[i]

        # Handle missing or empty dates
        if not date_value:
            dates_holder['missing_dates'].append(sample_name)
            self.valid_date_flag = False
            continue

        # 'o' = original: this flag skips date validation and reformatting entirely
        if flag == 'o':
            continue

        try:
            date_value_str = str(date_value)

            # Try each pattern in turn, remembering its group ordering
            match, order = None, None
            for pattern, group_order in date_patterns:
                match = re.match(pattern, date_value_str)
                if match:
                    order = group_order
                    break
            if match is None:
                raise ValueError(f"Invalid date format {date_value_str}")

            # Map captured groups onto year/month/day per the matched pattern
            parts = dict(zip(order, match.groups()))
            year = parts.get('y')
            month = parts.get('m') or "00"
            day = parts.get('d') or "00"
            if year is None:
                raise ValueError(f"Invalid date format {date_value_str}")

            # Reject ambiguous two-digit years
            if len(year) == 2:
                raise ValueError("Two-digit year detected. Use a four-digit year for clarity.")

            # Normalize month and day to two digits
            month = month.zfill(2)
            day = day.zfill(2)

            if flag == 'v':
                # Verbose: YYYY-MM-DD
                dates_list[i] = f"{year}-{month}-{day}"
            elif flag == 's':
                # Short (default): YYYY-MM
                dates_list[i] = f"{year}-{month}"
            else:
                raise ValueError(f"Unknown date_format_flag {self.parameters['date_format_flag']}")

        except Exception:
            dates_holder['invalid_dates'].append(sample_name)
            self.valid_date_flag = False

    # Record any problems for the user before failing
    with open("date_errors.txt", "w") as error_file:
        if dates_holder['missing_dates']:
            self.date_error_msg += f'Missing Dates: {", ".join(dates_holder["missing_dates"])}. '
            error_file.write(f"Missing dates for samples: {dates_holder['missing_dates']}\n")
        if dates_holder['invalid_dates']:
            error_file.write(f"Invalid dates for samples: {dates_holder['invalid_dates']}\n")

    if not self.valid_date_flag:
        raise ValueError("Date validation failed. Check 'date_errors.txt' for details.")

    # Update the dataframe with formatted dates
    self.metadata_df["collection_date"] = dates_list

@staticmethod
def check_authors(authors):
Expand Down Expand Up @@ -581,24 +619,30 @@ def check_meta_core(self, sample_line):
self.sample_error_msg += "\n\t\tMissing Required Metadata: " + ", ".join(missing_fields)
if len(missing_optionals) != 0:
self.sample_error_msg += "\n\t\tMissing Optional Metadata: " + ", ".join(missing_optionals)

def check_meta_case(self, sample_info):
    """ Checks and removes demographics metadata for cases (sex, age, race, and ethnicity) if present.

    Any non-empty value found in one of self.case_fields is replaced with
    "Not Provided" in sample_info, the removal is reported in the sample
    error message, and the change is merged back into self.metadata_df.

    Returns the updated metadata dataframe.
    """
    # meta_case_grade must have been reset to True between samples
    try:
        assert self.meta_case_grade is True
    except AssertionError:
        raise AssertionError(f'Meta case grade was not properly reset back to True after sample round')

    invalid_case_data = []
    try:
        for field in self.case_fields:
            # str() on the cell means the None entry below is defensive only;
            # empty and already-scrubbed values are left untouched
            if field in sample_info.columns and str(sample_info[field].values[0]) not in ["", None, "Not Provided"]:
                invalid_case_data.append(field)
                # Remove the case data from the dataframe
                sample_info.at[sample_info.index[0], field] = "Not Provided"  # Replace value with Not Provided string
    except Exception:
        # narrowed from a bare except: so SystemExit/KeyboardInterrupt still propagate
        self.meta_case_grade = False
    # Develop error message if case data was found and removed
    if invalid_case_data:
        self.sample_error_msg += (
            f'\n\t\tPresent Case Data found in: {", ".join(invalid_case_data)}.'
            f'\n\t\tThe case data has been removed automatically.'
        )
    self.metadata_df.update(sample_info)
    return self.metadata_df

class Check_Illumina_Nanopore_SRA:
""" Class constructor for the various checks on instruments
Expand Down Expand Up @@ -793,7 +837,7 @@ def capture_errors_per_sample(self):
self.write_tsv_file(sample_passed)

def capture_final_error(self, final_error_file, repeat_error, matchup_error,
valid_date_flag, date_error_msg, case_data_detected, valid_sample_num, metadata_df,
valid_date_flag, date_error_msg, valid_sample_num, metadata_df,
list_of_sample_errors, repeated, did_validation_work):
""" Handles the final error message
"""
Expand All @@ -811,11 +855,6 @@ def capture_final_error(self, final_error_file, repeat_error, matchup_error,
did_validation_work = False
final_error += f"{date_error_msg}\n"

# write the case data error message
if case_data_detected is True:
did_validation_work = False
final_error += f'Keep Personal Info Flag is True But Case Data is Empty!'

final_error_file.write("General Errors:\n\n")
if final_error != '':
final_error_file.write(final_error)
Expand Down
4 changes: 2 additions & 2 deletions conf/test_params.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ params {
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
val_output_dir = 'validation_outputs'
val_date_format_flag = 's'
val_keep_pi = false
date_format_flag = 's' // s = default (YYYY-MM), v = verbose(YYYY-MM-DD), o = original (leaves format unchanged)
remove_demographic_info = false // if true, values in host_sex, host_age, race, ethnicity are set to 'Not Provided'
validate_custom_fields = false
custom_fields_file = "${projectDir}/assets/custom_meta_fields/example_custom_fields.json"

Expand Down
6 changes: 5 additions & 1 deletion modules/local/metadata_validation/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@ process METADATA_VALIDATION {
input:
path meta_path

def remove_demographic_info = params.remove_demographic_info == true ? '--remove_demographic_info' : ''
def validate_custom_fields = params.validate_custom_fields == true ? '--validate_custom_fields' : ''

script:
"""
validate_metadata.py \
--meta_path $meta_path \
--output_dir . \
--custom_fields_file $params.custom_fields_file \
--validate_custom_fields $params.validate_custom_fields
--date_format_flag $params.date_format_flag \
$remove_demographic_info $validate_custom_fields
"""

output:
Expand Down
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ params {
ref_gff_path = "${projectDir}/assets/ref/ref.MPXV.NC063383.v7.gff"

// validation subworkflow
val_date_format_flag = 's'
val_keep_pi = false
date_format_flag = 's' // s = default (YYYY-MM), v = verbose(YYYY-MM-DD), o = original (leaves format unchanged)
remove_demographic_info = false // if true, values in host_sex, host_age, race, ethnicity are set to 'Not Provided'
validate_custom_fields = false
custom_fields_file = "${projectDir}/assets/custom_meta_fields/example_custom_fields.json"

Expand Down
6 changes: 3 additions & 3 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@
"description": "File path for outputs specific to validate sub-workflow",
"default": "validation_outputs"
},
"val_date_format_flag": {
"date_format_flag": {
"type": "string",
"description": "Flag to change date output",
"default": "s"
},
"val_keep_pi": {
"keep_demographic_info": {
"type": "boolean",
"description": "Flag to keep personal identifying info, if provided otherwise it will return an error"
"description": "Flag that, if false, will remove any potentially identifying demographic data"
},
"validate_custom_fields": {
"type": "boolean",
Expand Down

0 comments on commit a614b35

Please sign in to comment.