Skip to content

Commit

Permalink
fix the date format function
Browse files Browse the repository at this point in the history
  • Loading branch information
Jessica Rowell authored and Jessica Rowell committed Jan 18, 2025
1 parent d524a00 commit 3ab77d3
Showing 1 changed file with 95 additions and 53 deletions.
148 changes: 95 additions & 53 deletions bin/validate_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,73 +451,115 @@ def check_for_repeats_in_meta(self):

def check_date(self):
"""
Reformats the date ... expects YYYY or YYYY-MM-DD or YYYY-MM... if dates are empty then outputs error in txt file.
Otherwise it throws and assertion error and stops the program to tell user to check date formats and input flag
Reformats the date based on the input flag. Accepts date formats as YYYY, YYYY-MM, YYYY-MM-DD, MMDDYYYY, MMYYYY.
Flags dates with two-digit years (YY) as invalid.
Outputs error details in a text file for missing or invalid dates and raises an error if the flag is not valid.
"""
try:
assert self.valid_date_flag is True
except AssertionError:
raise AssertionError(f'Valid date flag is not proper default value of True')
raise AssertionError(f"Valid date flag is not properly set to the default value of True")

dates_list = self.metadata_df["collection_date"].tolist()
samples_list = self.metadata_df["sample_name"].tolist()
dates_holder = {'missing_dates': [], 'invalid_dates': []}

# based on input flag reformats the date
for i in range(len(dates_list)):
if dates_list[i] == "" or dates_list[i] == '' or dates_list[i] == None or dates_list[i] == []:
dates_holder['missing_dates'].append(samples_list[i])

# Updated regex patterns for different formats
date_patterns = [
# Matches different date forms, all patterns accept '-' or '/' separator, and all have optional time
r"^(\d{4})(?:[-/](\d{1,2}))?(?:[-/](\d{1,2}))?(?:\s(\d{2}):(\d{2}):(\d{2}))?$", # YYYY, YYYY/MM, YYYY/MM/DD, YYYY/M, YYYY/M/DD
r"^(\d{1,2})(?:[-/](\d{1,2}))?(?:[-/](\d{4}))?(?:\s(\d{2}):(\d{2}):(\d{2}))?$", # M/D/YYYY, MM/DD/YYYY
r"^(\d{1,2})(?:[-/](\d{4}))?(?:\s(\d{2}):(\d{2}):(\d{2}))?$", # M/YYYY, MM/YYYY
r"^(\d{1,2})(?:[-/](\d{1,2}))[-/](\d{2})(?:\s(\d{2}):(\d{2}):(\d{2}))?$" # M/D/YY, MM/DD/YY
]

print(f"flag: {self.parameters['date_format_flag']}")
for i, date_value in enumerate(dates_list):
sample_name = samples_list[i]

# Handle missing or empty dates
if not date_value:
dates_holder['missing_dates'].append(sample_name)
self.valid_date_flag = False
continue

# checks if the date is in y-m format and converts to y-m-d format (puts 00 at end?)
elif self.parameters['date_format_flag'].lower() == 'v' and str(dates_list[i]).count('-') == 1:
try:
dates_list[i] = f'{dates_list[i]}-00'
except:
dates_holder['invalid_dates'].append(samples_list[i])
self.valid_date_flag = False

# checks if the date is in y-m-d format and needs to be converted to y-m format (without day)
elif self.parameters['date_format_flag'].lower() == 's' and str(dates_list[i]).count('-') == 2:
try:
dates_list[i] = '-'.join(dates_list[i].split('-')[:1])
except:
dates_holder['invalid_dates'].append(samples_list[i])
self.valid_date_flag = False

elif str(dates_list[i]).count('-') == 0:
if self.parameters['date_format_flag'].lower() == 's':
try:
dates_list[i] = f'{dates_list[i]}-00'
except:
dates_holder['invalid_dates'].append(samples_list[i])
self.valid_date_flag = False
elif self.parameters['date_format_flag'].lower() == 'v':
try:
dates_list[i] = f'{dates_list[i]}-00-00'
except:
dates_holder['invalid_dates'].append(samples_list[i])
self.valid_date_flag = False

# output error messages collected
if self.valid_date_flag is False:
try:
assert True in [len(dates_holder['missing_dates']) != 0, len(dates_holder['invalid_dates']) != 0]
except AssertionError:
raise AssertionError(f'Valid date flag was set as false, but recorded missing or invalid dates was empty')
self.date_error_msg = f'\nDate Errors:\n'
date_value_str = str(date_value)

# Try matching against each pattern
match = None
for pattern in date_patterns:
match = re.match(pattern, date_value_str)
if match:
break
'''
if match:
print(f"1: {self.valid_date_flag}")
# Extract date components
if len(match.groups()) == 3: # Match for MMDDYYYY or MM-DD-YYYY
month = match.group(1)
day = match.group(2) if len(match.groups()) > 2 else "00"
year = match.group(3)
elif len(match.groups()) == 2: # Match for MMYYYY, MM-YYYY, MM/YYYY
month = match.group(1)
year = match.group(2)
else: # Match for YYYY, YYYY-MM, YYYY-MM-DD
year = match.group(1)
month = match.group(2) if match.group(2) else "00"
day = match.group(3) if match.group(3) else "00"
print(f"2: {self.valid_date_flag}")
'''
if match:
# Extract date components
year, month, day, *_ = match.groups()
if not month:
month = "00"
if not day:
day = "00"

# Check if the year is only two digits
if len(year) == 2:
raise ValueError("Two-digit year detected. Use a four-digit year for clarity.")

# Normalize month and day to two digits
month = month.zfill(2)
day = day.zfill(2)

# Handle 'v' flag for YYYY-MM-DD format
if self.parameters['date_format_flag'].lower() == 'v':
# Ensure date is in YYYY-MM-DD format
dates_list[i] = f"{year}-{month}-{day}"

# Handle 's' flag for YYYY-MM format
elif self.parameters['date_format_flag'].lower() == 's':
# Ensure date is in YYYY-MM format
dates_list[i] = f"{year}-{month}"

else:
print(f"3: {self.valid_date_flag}")
raise ValueError(f"Unknown date_format_flag {self.parameters['date_format_flag']}")
else:
print(f"4: {self.valid_date_flag}")
raise ValueError(f"Invalid date format {date_value_str}")

except Exception as e:
dates_holder['invalid_dates'].append(sample_name)
self.valid_date_flag = False
print(f"5: {self.valid_date_flag}")

# Handle output or logging for missing and invalid dates
with open("date_errors.txt", "w") as error_file:
if dates_holder['missing_dates']:
try:
assert len(dates_holder['missing_dates']) != 0
except AssertionError:
raise AssertionError(f'Recorded missing dates as empty but still passed conditional')
self.date_error_msg += f'Missing Dates: {", ".join(dates_holder["missing_dates"])}. '
error_file.write(f"Missing dates for samples: {dates_holder['missing_dates']}\n")
if dates_holder['invalid_dates']:
raise ValueError(f'Unable to convert date format according to passed in {self.parameters["date_format_flag"]} '
f'value for date_format_flag. Please confirm date is in right format and this flag was intended')
error_file.write(f"Invalid dates for samples: {dates_holder['invalid_dates']}\n")

if not self.valid_date_flag:
print(f"6: {self.valid_date_flag}")
raise ValueError("Date validation failed. Check 'date_errors.txt' for details.")

# place the modified date list into the dataframe
self.metadata_df['collection_date'] = dates_list
# Update the dataframe with formatted dates
self.metadata_df["collection_date"] = dates_list

@staticmethod
def check_authors(authors):
Expand Down

0 comments on commit 3ab77d3

Please sign in to comment.