diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py
index d834d73..c760de5 100755
--- a/eva_sub_cli/executables/cli.py
+++ b/eva_sub_cli/executables/cli.py
@@ -40,7 +40,7 @@ def main():
     argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
     argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
     argparser.add_argument('--submission_dir', required=True, type=str,
-                           help='Full path to the directory where all processing will be done '
+                           help='Path to the directory where all processing will be done '
                                 'and submission info is/will be stored')
     vcf_group = argparser.add_argument_group(
         'Input VCF and assembly',
@@ -57,7 +57,7 @@ def main():
                                 help="Json file that describe the project, analysis, samples and files")
     metadata_group.add_argument("--metadata_xlsx",
                                 help="Excel spreadsheet that describe the project, analysis, samples and files")
-    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
+    argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
                            help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the'
                                 ' outcome of previous runs. Selecting SUBMIT will run validate only if the validation'
                                 ' was not performed successfully before and then run the submission.')
@@ -67,12 +67,18 @@ def main():
                           'upload to the EVA')
     credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
     credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
+    argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')

     args = argparser.parse_args()

     validate_command_line_arguments(args, argparser)

-    logging_config.add_stdout_handler()
+    args.submission_dir = os.path.abspath(args.submission_dir)
+
+    if args.debug:
+        logging_config.add_stdout_handler(logging.DEBUG)
+    else:
+        logging_config.add_stdout_handler()
     logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

     try:
diff --git a/eva_sub_cli/executables/xlsx2json.py b/eva_sub_cli/executables/xlsx2json.py
index d10960f..179c0d3 100644
--- a/eva_sub_cli/executables/xlsx2json.py
+++ b/eva_sub_cli/executables/xlsx2json.py
@@ -372,6 +372,6 @@ def main():
     try:
         parser.json(args.metadata_json)
     except Exception as e:
-        parser.add_error(e)
+        parser.add_error(f'An error was raised while converting the spreadsheet to JSON: {repr(e)}')
     finally:
         parser.save_errors(args.errors_yaml)
diff --git a/eva_sub_cli/metadata_utils.py b/eva_sub_cli/metadata_utils.py
index ce967ae..df7f436 100644
--- a/eva_sub_cli/metadata_utils.py
+++ b/eva_sub_cli/metadata_utils.py
@@ -22,8 +22,7 @@ def get_files_per_analysis(metadata):
     """Returns mapping of analysis alias to filenames, based on metadata."""
     files_per_analysis = defaultdict(list)
     for file_info in metadata.get('files', []):
-        if file_info.get('fileType') == 'vcf':
-            files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
+        files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
     return {
         analysis_alias: set(filepaths)
         for analysis_alias, filepaths in files_per_analysis.items()
     }
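A note on the `--tasks` change in cli.py above: with nargs='*', a bare `--tasks` on the command line is accepted and yields an empty list, which would silently skip both validation and submission; nargs='+' makes argparse demand at least one value. A minimal, self-contained sketch of the difference (the VALIDATE/SUBMIT values are assumed here to be the lowercase task names):

    from argparse import ArgumentParser

    VALIDATE, SUBMIT = 'validate', 'submit'  # assumed values, mirroring the CLI constants

    parser = ArgumentParser()
    # nargs='+' requires at least one value after --tasks; nargs='*' would accept none
    parser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT],
                        default=[SUBMIT], type=str.lower)

    print(parser.parse_args([]).tasks)                       # ['submit'] (default)
    print(parser.parse_args(['--tasks', 'VALIDATE']).tasks)  # ['validate'], via type=str.lower
    # parser.parse_args(['--tasks'])  # with nargs='+': error, expected at least one argument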
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index a6253a5..1b28e06 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -195,12 +195,13 @@ process convert_xlsx_2_json {
     output:
     path "metadata.json", emit: metadata_json, optional: true
     path "metadata_conversion_errors.yml", emit: errors_yaml
+    path "xlsx2json.log", emit: xlsx2json_log

     script:
     metadata_json = metadata_xlsx.getBaseName() + '.json'

     """
-    $params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration
+    $params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration > xlsx2json.log 2>&1
     """
 }

@@ -217,7 +218,7 @@ process metadata_json_validation {

     script:
     """
-    $params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
+    $params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt 2>&1
     """
 }

@@ -232,10 +233,11 @@ process sample_name_concordance {

     output:
     path "sample_checker.yml", emit: sample_checker_yml
+    path "sample_checker.log", emit: sample_checker_log

     script:
     """
-    $params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
+    $params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1
     """
 }

@@ -251,10 +253,11 @@ process insdc_checker {

     output:
     path "${fasta_file}_check.yml", emit: fasta_checker_yml
+    path "fasta_checker.log", emit: fasta_checker_log

     script:
     """
-    $params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
+    $params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml > fasta_checker.log 2>&1
     """
 }

@@ -269,9 +272,10 @@ process metadata_semantic_check {

     output:
     path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
+    path "semantic_checker.log", emit: semantic_checker_log

     script:
     """
-    $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml
+    $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
     """
 }
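The redirections added to each validation.nf process (`> some.log 2>&1`) merge stderr into stdout and write both to the log file each process now emits, so tracebacks from the checker scripts are preserved as workflow outputs. A rough Python equivalent of that shell redirection, for illustration only (the command is a placeholder):

    import subprocess

    # stdout goes to the log file; stderr is merged into the same stream,
    # mirroring `> sample_checker.log 2>&1` in the Nextflow script block
    with open('sample_checker.log', 'w') as log:
        subprocess.run(['echo', 'placeholder command'],
                       stdout=log, stderr=subprocess.STDOUT, check=True)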
diff --git a/eva_sub_cli/orchestrator.py b/eva_sub_cli/orchestrator.py
index 51d1de2..a14c31d 100755
--- a/eva_sub_cli/orchestrator.py
+++ b/eva_sub_cli/orchestrator.py
@@ -54,7 +54,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
             project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, True)
         for mapping in vcf_files_mapping:
-            writer.writerow(mapping);
+            writer.writerow(mapping)

     return project_title, mapping_file

@@ -80,6 +80,7 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_
     return project_title, vcf_fasta_report_mapping

+
 def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_req=False):
     workbook = load_workbook(metadata_xlsx)

@@ -108,15 +109,19 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_
             files_headers[cell.value] = cell.column - 1

     for row in files_sheet.iter_rows(min_row=2, values_only=True):
-        file_name = row[files_headers['File Name']]
+        file_name = os.path.abspath(row[files_headers['File Name']])
         analysis_alias = row[files_headers['Analysis Alias']]
-        reference_fasta = analysis_alias_dict[analysis_alias]
+        reference_fasta = os.path.abspath(analysis_alias_dict[analysis_alias])
+        if not (file_name and os.path.isfile(file_name)):
+            raise FileNotFoundError(f'The variant file {file_name} provided in spreadsheet {metadata_xlsx} does not exist')
+        if not (reference_fasta and os.path.isfile(reference_fasta)):
+            raise FileNotFoundError(f'The reference fasta {reference_fasta} in spreadsheet {metadata_xlsx} does not exist')
         vcf_fasta_report_mapping.append([os.path.abspath(file_name), os.path.abspath(reference_fasta), ''])

     return project_title, vcf_fasta_report_mapping


-def check_validation_required(tasks, sub_config):
+def check_validation_required(tasks, sub_config, username=None, password=None):
     # Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
     if SUBMIT in tasks:
         if not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False):
@@ -124,7 +129,7 @@ def check_validation_required(tasks, sub_config):
             submission_id = sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, None)
             if submission_id:
                 try:
-                    submission_status = SubmissionWSClient().get_submission_status(submission_id)
+                    submission_status = SubmissionWSClient(username, password).get_submission_status(submission_id)
                     if submission_status == 'FAILED':
                         return True
                     else:
@@ -150,13 +155,20 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso
     metadata_file = metadata_json or metadata_xlsx
     if not os.path.exists(os.path.abspath(metadata_file)):
-        raise FileNotFoundError(f'The provided metadata file {metadata_file} does not exist')
+        raise FileNotFoundError(f'The provided metadata file {os.path.abspath(metadata_file)} does not exist')
+
+    if metadata_json:
+        metadata_json = os.path.abspath(metadata_json)
+    if metadata_xlsx:
+        metadata_xlsx = os.path.abspath(metadata_xlsx)

     # Get the provided Project Title and VCF files mapping (VCF, Fasta and Report)
-    project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx)
+    project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(
+        submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx
+    )
     vcf_files = get_vcf_files(vcf_files_mapping)

-    if VALIDATE not in tasks and check_validation_required(tasks, sub_config):
+    if VALIDATE not in tasks and check_validation_required(tasks, sub_config, username, password):
         tasks.append(VALIDATE)

     if VALIDATE in tasks:
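The orchestrator.py changes above normalise every user-supplied path with os.path.abspath before any work starts. This matters because the native validator (further down in this diff) now chdirs into submission_dir, after which relative paths would resolve against the wrong directory. A small POSIX-flavoured demonstration, using a hypothetical metadata.xlsx in the launch directory:

    import os

    rel = 'metadata.xlsx'            # hypothetical relative input
    pinned = os.path.abspath(rel)    # resolved against the launch directory
    os.chdir('/tmp')                 # simulate the validator changing directory
    print(os.path.abspath(rel))      # now /tmp/metadata.xlsx -- the wrong file
    print(pinned)                    # still points at the original location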
diff --git a/eva_sub_cli/submission_ws.py b/eva_sub_cli/submission_ws.py
index bb7cd02..5512a51 100644
--- a/eva_sub_cli/submission_ws.py
+++ b/eva_sub_cli/submission_ws.py
@@ -43,19 +43,19 @@ def _submission_status_url(self, submission_id):

     def mark_submission_uploaded(self, submission_id, metadata_json):
         response = requests.put(self._submission_uploaded_url(submission_id),
-                                headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer ' + self.auth.token},
-                                data=metadata_json)
+                                headers={'Accept': 'application/json', 'Authorization': 'Bearer ' + self.auth.token},
+                                json=metadata_json)
         response.raise_for_status()
         return response.json()

     def initiate_submission(self):
-        response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/hal+json',
+        response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/json',
                                                                            'Authorization': 'Bearer ' + self.auth.token})
         response.raise_for_status()
         return response.json()

     @retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
     def get_submission_status(self, submission_id):
-        response = requests.get(self.get_submission_status_url(submission_id))
+        response = requests.get(self._submission_status_url(submission_id))
         response.raise_for_status()
         return response.text
diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py
index 8f71128..4b7842c 100644
--- a/eva_sub_cli/submit.py
+++ b/eva_sub_cli/submit.py
@@ -64,7 +64,8 @@ def _upload_submission(self):
     def _upload_file(self, submission_upload_url, input_file):
         base_name = os.path.basename(input_file)
         self.debug(f'Transfer {base_name} to EVA FTP')
-        r = requests.put(os.path.join(submission_upload_url, base_name), data=open(input_file, 'rb'))
+        with open(input_file, 'rb') as f:
+            r = requests.put(os.path.join(submission_upload_url, base_name), data=f)
         r.raise_for_status()
         self.debug(f'Upload of {base_name} completed')
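In submission_ws.py, switching requests from data= to json= does two things: the payload is serialised with json.dumps and a Content-Type: application/json header is set automatically, whereas data= sends a dict form-encoded (or a string as a raw body) with no JSON header. A minimal sketch against a placeholder endpoint:

    import requests

    url = 'https://example.org/submission/some-id/uploaded'  # placeholder URL
    payload = {'files': [{'fileName': 'example1.vcf.gz'}]}

    # json= serialises payload and adds Content-Type: application/json;
    # data=payload would instead form-encode the dict and set no JSON header
    response = requests.put(url, json=payload, headers={'Accept': 'application/json'})
    response.raise_for_status()

The companion change in submit.py wraps the upload's open() in a with block so the file handle is closed even if the PUT raises.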
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index fcf4051..6e2ec0b 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)

 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev14'
+container_tag = 'v0.0.1.dev15'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'

@@ -100,10 +100,10 @@ def verify_container_is_running(self):
         try:
             container_run_cmd_output = self._run_quiet_command("check if container is running", f"{self.docker_path} ps", return_process_output=True)
             if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
-                logger.info(f"Container ({self.container_name}) is running")
+                logger.debug(f"Container ({self.container_name}) is running")
                 return True
             else:
-                logger.info(f"Container ({self.container_name}) is not running")
+                logger.debug(f"Container ({self.container_name}) is not running")
                 return False
         except subprocess.CalledProcessError as ex:
             logger.error(ex)
@@ -114,10 +114,10 @@ def verify_container_is_stopped(self):
             "check if container is stopped", f"{self.docker_path} ps -a", return_process_output=True
         )
         if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
-            logger.info(f"Container ({self.container_name}) is in stop state")
+            logger.debug(f"Container ({self.container_name}) is in stop state")
             return True
         else:
-            logger.info(f"Container ({self.container_name}) is not in stop state")
+            logger.debug(f"Container ({self.container_name}) is not in stop state")
             return False

     def try_restarting_container(self):
@@ -137,14 +137,14 @@ def verify_image_available_locally(self):
             return_process_output=True
         )
         if container_images_cmd_ouptut is not None and re.search(container_image + r'\s+' + container_tag, container_images_cmd_ouptut):
-            logger.info(f"Container ({container_image}) image is available locally")
+            logger.debug(f"Container ({container_image}) image is available locally")
             return True
         else:
-            logger.info(f"Container ({container_image}) image is not available locally")
+            logger.debug(f"Container ({container_image}) image is not available locally")
             return False

     def run_container(self):
-        logger.info(f"Trying to run container {self.container_name}")
+        logger.debug(f"Trying to run container {self.container_name}")
         try:
             self._run_quiet_command(
                 "Try running container",
@@ -166,7 +166,7 @@ def stop_running_container(self):
         )

     def download_container_image(self):
-        logger.info(f"Pulling container ({container_image}) image")
+        logger.debug(f"Pulling container ({container_image}) image")
         try:
             self._run_quiet_command("pull container image", f"{self.docker_path} pull {container_image}:{container_tag}")
         except subprocess.CalledProcessError as ex:
diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py
index f125907..eb95939 100644
--- a/eva_sub_cli/validators/native_validator.py
+++ b/eva_sub_cli/validators/native_validator.py
@@ -24,11 +24,15 @@ def _validate(self):

     def run_validator(self):
         self.verify_executables_installed()
+        curr_wd = os.getcwd()
         try:
             command = self.get_validation_cmd()
+            os.chdir(self.submission_dir)
             self._run_quiet_command("Run Validation using Nextflow", command)
         except subprocess.CalledProcessError as ex:
             logger.error(ex)
+        finally:
+            os.chdir(curr_wd)

     def get_validation_cmd(self):
         if self.metadata_xlsx and not self.metadata_json:
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index 5ffa833..c0c9681 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -545,6 +545,10 @@ def _collect_file_info_to_metadata(self):
                 file_name_2_md5[os.path.basename(vcf_file)] = md5sum
                 file_path_2_file_size[vcf_file] = file_size
                 file_name_2_file_size[os.path.basename(vcf_file)] = file_size
+        else:
+            self.error(
+                f"Cannot locate file_info.txt at {os.path.join(self.output_dir, 'other_validations', 'file_info.txt')}"
+            )
         if self.metadata_json_post_validation:
             with open(self.metadata_json_post_validation) as open_file:
                 try:
@@ -553,12 +557,11 @@ def _collect_file_info_to_metadata(self):
                     files_from_metadata = json_data.get('files', [])
                     if files_from_metadata:
                         for file_dict in json_data.get('files', []):
-                            if file_dict.get('fileType') == 'vcf':
-                                file_path = self._validation_file_path_for(file_dict.get('fileName'))
-                                file_dict['md5'] = file_path_2_md5.get(file_path) or \
-                                                   file_name_2_md5.get(file_dict.get('fileName')) or ''
-                                file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
-                                                        file_name_2_file_size.get(file_dict.get('fileName')) or ''
+                            file_path = self._validation_file_path_for(file_dict.get('fileName'))
+                            file_dict['md5'] = file_path_2_md5.get(file_path) or \
+                                               file_name_2_md5.get(file_dict.get('fileName')) or ''
+                            file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
+                                                    file_name_2_file_size.get(file_dict.get('fileName')) or ''
                             file_rows.append(file_dict)
                     else:
                         self.error('No file found in metadata and multiple analysis alias exist: '
@@ -570,6 +573,8 @@ def _collect_file_info_to_metadata(self):
             if json_data:
                 with open(self.metadata_json_post_validation, 'w') as open_file:
                     json.dump(json_data, open_file)
+        else:
+            self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}')

     def get_vcf_fasta_analysis_mapping(self):
         vcf_fasta_analysis_mapping = []
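The run_validator change in native_validator.py saves the working directory, chdirs into submission_dir, and restores it in a finally block. The same save/chdir/restore pattern could also be packaged as a reusable context manager; a sketch only, with a hypothetical working_directory helper that is not part of this codebase:

    import os
    from contextlib import contextmanager

    @contextmanager
    def working_directory(path):
        """Chdir into path, restoring the original cwd even if the body raises."""
        previous = os.getcwd()
        os.chdir(path)
        try:
            yield
        finally:
            os.chdir(previous)

    # hypothetical usage inside run_validator:
    # with working_directory(self.submission_dir):
    #     self._run_quiet_command("Run Validation using Nextflow", command)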
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 1d39590..b11a856 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -13,6 +13,7 @@
 from eva_sub_cli.orchestrator import orchestrate_process, VALIDATE, SUBMIT, DOCKER, check_validation_required
 from eva_sub_cli.submit import SUB_CLI_CONFIG_KEY_SUBMISSION_ID
 from eva_sub_cli.validators.validator import READY_FOR_SUBMISSION_TO_EVA
+from tests.test_utils import touch


 class TestOrchestrator(unittest.TestCase):
@@ -32,9 +33,16 @@ def setUp(self) -> None:
             shutil.rmtree(self.test_sub_dir)
         os.makedirs(self.test_sub_dir)
         shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.json'), self.metadata_json)
+        shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx'), self.metadata_xlsx)
+        for file_name in ['example1.vcf.gz', 'example2.vcf', 'example3.vcf', 'GCA_000001405.27_fasta.fa']:
+            touch(os.path.join(self.test_sub_dir, file_name))
+        self.curr_wd = os.getcwd()
+        os.chdir(self.test_sub_dir)

     def tearDown(self) -> None:
-        shutil.rmtree(self.test_sub_dir)
+        os.chdir(self.curr_wd)
+        if os.path.exists(self.test_sub_dir):
+            shutil.rmtree(self.test_sub_dir)

     def test_check_validation_required(self):
         tasks = ['submit']
@@ -150,7 +158,6 @@ def test_orchestrate_with_vcf_files(self):
             )
             m_docker_validator().validate_and_report.assert_called_once_with()

-
     def test_orchestrate_with_metadata_json_without_asm_report(self):
         with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \
                 patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator:
@@ -189,7 +196,6 @@ def test_orchestrate_with_metadata_json_with_asm_report(self):
             )
             m_docker_validator().validate_and_report.assert_called_once_with()

-
     def test_orchestrate_vcf_files_takes_precedence_over_metadata(self):
         shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test_with_asm_report.json'), self.metadata_json)

@@ -213,8 +219,6 @@ def test_orchestrate_vcf_files_takes_precedence_over_metadata(self):


     def test_orchestrate_with_metadata_xlsx(self):
-        shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx'), self.metadata_xlsx)
-
         with patch('eva_sub_cli.orchestrator.WritableConfig') as m_config, \
                 patch('eva_sub_cli.orchestrator.DockerValidator') as m_docker_validator:
             orchestrate_process(self.test_sub_dir, None, None, None, self.metadata_xlsx,
@@ -232,11 +236,13 @@ def test_orchestrate_with_metadata_xlsx(self):
             )
             m_docker_validator().validate_and_report.assert_called_once_with()

-
     def test_metadata_file_does_not_exist_error(self):
         with self.assertRaises(Exception) as context:
-            orchestrate_process(self.test_sub_dir, None, None, None, self.metadata_xlsx,
-                                tasks=[VALIDATE], executor=DOCKER)
-        self.assertRegex(str(context.exception),r"The provided metadata file .*/resources/test_sub_dir/sub_metadata.xlsx does not exist")
+            orchestrate_process(self.test_sub_dir, None, None, None, 'Non_existing_metadata.xlsx',
+                                tasks=[VALIDATE], executor=DOCKER)
+        self.assertRegex(
+            str(context.exception),
+            r"The provided metadata file .*/resources/test_sub_dir/Non_existing_metadata.xlsx does not exist"
+        )
diff --git a/tests/test_submit.py b/tests/test_submit.py
index f3b9ac6..8c697f3 100644
--- a/tests/test_submit.py
+++ b/tests/test_submit.py
@@ -58,13 +58,12 @@ def test_submit(self):

             mock_post.assert_called_once_with(
                 os.path.join(test_submission_ws_client.SUBMISSION_WS_URL, 'submission/initiate'),
-                headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer a token'})
+                headers={'Accept': 'application/json', 'Authorization': 'Bearer a token'})
             mock_put.assert_called_once_with(
                 os.path.join(test_submission_ws_client.SUBMISSION_WS_URL, 'submission/mock_submission_id/uploaded'),
-                headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer a token'},
-                data=self.metadata_json)
-            print(mock_put.mock_calls)
+                headers={'Accept': 'application/json', 'Authorization': 'Bearer a token'},
+                json=self.metadata_json)

     def test_submit_with_config(self):
         mock_initiate_response = MagicMock()
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8ccf613..6cf26d2 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,3 +7,8 @@ def create_mapping_file(mapping_file, vcf_files, fasta_files, assembly_reports):
         writer.writerow(['vcf', 'fasta', 'report'])
         for vcf_file, fasta_file, assembly_reports in zip(vcf_files, fasta_files, assembly_reports):
             writer.writerow([vcf_file, fasta_file, assembly_reports])
+
+
+def touch(file_path, content=''):
+    with open(file_path, 'w') as open_file:
+        open_file.write(content)
\ No newline at end of file
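For reference, the new touch helper just creates (or truncates) a file, which is enough for the os.path.isfile checks introduced in orchestrator.py to pass without real VCF or FASTA content. A usage sketch:

    from tests.test_utils import touch

    touch('example1.vcf.gz')                     # empty placeholder file
    touch('notes.txt', content='optional body')  # or with some content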