From bc269b745999cc478b58c66615293a0efa17a595 Mon Sep 17 00:00:00 2001 From: gregdenay Date: Mon, 23 Oct 2023 14:55:40 +0200 Subject: [PATCH 1/2] Will accept fastq using the isolae_name_alt naming --- .tests/integration/config/config.yaml | 2 +- .tests/unit/consolidate_ids/data/metadata.tsv | 3 + .../consolidate_ids/data/sample_sheet.tsv | 3 + .../consolidate_ids/expected/sample_sheet.tsv | 3 + .tests/unit/test_consolidate_ids.py | 35 +++++++++++ CHANGELOG.md | 6 ++ VERSION | 2 +- docs/user_guide.md | 8 ++- workflow/rules/aquamis.smk | 17 ++++- workflow/rules/exports.smk | 2 +- workflow/scripts/consolidate_ids.py | 63 +++++++++++++++++++ workflow/scripts/geuebt_metadata.py | 4 +- workflow/scripts/nrls_export.py | 6 +- 13 files changed, 145 insertions(+), 9 deletions(-) create mode 100644 .tests/unit/consolidate_ids/data/metadata.tsv create mode 100644 .tests/unit/consolidate_ids/data/sample_sheet.tsv create mode 100644 .tests/unit/consolidate_ids/expected/sample_sheet.tsv create mode 100644 .tests/unit/test_consolidate_ids.py create mode 100644 workflow/scripts/consolidate_ids.py diff --git a/.tests/integration/config/config.yaml b/.tests/integration/config/config.yaml index a796bd1..732964a 100644 --- a/.tests/integration/config/config.yaml +++ b/.tests/integration/config/config.yaml @@ -12,4 +12,4 @@ metadata: data/metadata.tsv max_threads_sample: 1 # fastq naming scheme for BfR create_sample_sheet script ('illumina', 'ncbi', 'dot', 'flex') -fastq_naming: illumina +fastq_naming: flex diff --git a/.tests/unit/consolidate_ids/data/metadata.tsv b/.tests/unit/consolidate_ids/data/metadata.tsv new file mode 100644 index 0000000..049e5b9 --- /dev/null +++ b/.tests/unit/consolidate_ids/data/metadata.tsv @@ -0,0 +1,3 @@ +isolate_id sample_id organism isolate_name_alt isolation_org sequencing_org extraction_method library_method sequencing_instrument bioinformatics_org third_party_flag third_party_owner sample_type collection_date collection_municipality 
collection_country collection_cause collected_by manufacturer designation manufacturer_type sample_description lot_number +2016-0000962-01 2016-0000962 Listeria monocytogenes unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown +2016-0000962-02 2016-0000962 Listeria monocytogenes 16-LI00962-0 unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown diff --git a/.tests/unit/consolidate_ids/data/sample_sheet.tsv b/.tests/unit/consolidate_ids/data/sample_sheet.tsv new file mode 100644 index 0000000..a523e6e --- /dev/null +++ b/.tests/unit/consolidate_ids/data/sample_sheet.tsv @@ -0,0 +1,3 @@ +sample fq1 fq2 +2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz +16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz diff --git a/.tests/unit/consolidate_ids/expected/sample_sheet.tsv b/.tests/unit/consolidate_ids/expected/sample_sheet.tsv new file mode 100644 index 0000000..e16a94f --- /dev/null +++ b/.tests/unit/consolidate_ids/expected/sample_sheet.tsv @@ -0,0 +1,3 @@ +sample fastq_name fq1 fq2 +2016-0000962-01 2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz +2016-0000962-02 16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz diff --git a/.tests/unit/test_consolidate_ids.py 
def test_consolidate_ids():
    """Run ``consolidate_ids.main`` on the fixture tables and compare the output.

    The fixture data and the script are copied into a temporary working
    directory, ``main`` is imported from that copy and executed, and the
    resulting sample sheet is compared with the expected one.

    The relative fixture paths below are resolved against the current working
    directory.
    """
    with TemporaryDirectory() as tmpdir:
        # Modify paths to link to your test data and script
        workdir = os.path.join(Path(tmpdir), "workdir")
        data_path = PurePosixPath(".tests/unit/consolidate_ids/data")
        expected_path = PurePosixPath(".tests/unit/consolidate_ids/expected")
        script_path = PurePosixPath(".tests/../workflow/scripts/consolidate_ids.py")

        # Copy data and script to the temporary workdir.
        shutil.copytree(data_path, workdir)
        shutil.copy(script_path, workdir)

        result = os.path.join(workdir, "result.tsv")

        # Import the copied script; drop the temporary directory from the
        # import path afterwards so that no dangling entry pointing at a
        # deleted directory is left behind for other tests.
        sys.path.insert(0, workdir)
        try:
            from consolidate_ids import main  # import main from the copied script

            main(
                ssheet=os.path.join(workdir, "sample_sheet.tsv"),
                metadata=os.path.join(workdir, "metadata.tsv"),
                sheetout=result,
            )
        finally:
            sys.path.remove(workdir)

        # shallow=False forces a byte-wise comparison; the default shallow mode
        # may accept files based on their os.stat() signature alone.
        assert filecmp.cmp(
            result,
            os.path.join(expected_path, "sample_sheet.tsv"),
            shallow=False,
        )
+ ### 0.1.1 Add missing metadata field "assembly_method" in metadata JSON for geuebt export diff --git a/VERSION b/VERSION index 6da28dd..341cf11 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 \ No newline at end of file +0.2.0 \ No newline at end of file diff --git a/docs/user_guide.md b/docs/user_guide.md index 107ba37..8e7a832 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -95,6 +95,10 @@ File naming should explicitely follow one the 4 schemes: This naming convention is used to generate a sample sheet with the AQUAMIS `create_sampleSheet` helper. See the documentation directly in the AQUAMIS repository. +Note that the workflow will expect fastq to be named after either the value of the `isolate_name_alt` if it is filled +or the value of the `isolate_id` field if `isolate_name_alt` is empty. If a fastq has a name that is not in these fields, +the workflow will stop with an error. + ## Metadata Metadata hould be provided as a table ina a flat text format, using tab separators. 
@@ -102,10 +106,10 @@ The table must contain following fields, not all of which must be filled: | Field | Description | Required | |---|---|---| -| isolate_id | Unique data identification in database | required | +| isolate_id | Unique isolate identification in database | required | | sample_id | Unique sample identification for epidemiological analysis | required | | organims | Sample categorization for downstream analysis | required | -| isolate_name_alt | Laboratory internal name | optional | +| isolate_name_alt | Laboratory internal name (unique for each isolate) | optional | | isolation_org | Contact organisation for isolate | required | | sequencing_org | Contact organisation for sequencing | required | | extraction_method | Extraction method name | optional | diff --git a/workflow/rules/aquamis.smk b/workflow/rules/aquamis.smk index 0794f62..8aeeec3 100644 --- a/workflow/rules/aquamis.smk +++ b/workflow/rules/aquamis.smk @@ -23,9 +23,24 @@ rule create_sample_sheet: """ -checkpoint aquamis: +rule consolidate_ids: input: sample_sheet="sample_sheet/samples.tsv", + params: + metadata=config["metadata"], + output: + sample_sheet="sample_sheet/samples_isolate_ids.tsv", + conda: + "../envs/pandas.yaml", + log: + "logs/consolidate_ids.log", + script: + "../scripts/consolidate_ids.py" + + +checkpoint aquamis: + input: + sample_sheet="sample_sheet/samples_isolate_ids.tsv", output: outdir=directory("aquamis"), summary="aquamis/reports/summary_report.tsv", diff --git a/workflow/rules/exports.smk b/workflow/rules/exports.smk index 3aa8df5..fd4d542 100644 --- a/workflow/rules/exports.smk +++ b/workflow/rules/exports.smk @@ -38,7 +38,7 @@ rule nrls_export: input: # using geuebt table here becaus it's already qc checked metadata="geuebt_export/metadata.tsv", - ssheet="sample_sheet/samples.tsv", + ssheet="sample_sheet/samples_isolate_ids.tsv", output: outdir=directory("nrls_export"), flag=touch("nrls_export/sucess.flag"), diff --git a/workflow/scripts/consolidate_ids.py 
def main(ssheet, metadata, sheetout):
    """Re-key a sample sheet from fastq-derived names to metadata isolate IDs.

    Each name in the ``sample`` column of the sample sheet is looked up in the
    metadata table, first in the ``isolate_name_alt`` column and, if nothing
    matches there, in the ``isolate_id`` column. The output table carries the
    matching ``isolate_id`` in the ``sample`` column, the original name in a
    new ``fastq_name`` column, and all remaining sample sheet columns.

    Args:
        ssheet: Path to the tab-separated sample sheet; must have a ``sample``
            column.
        metadata: Path to the tab-separated metadata table; must have the
            ``isolate_id`` and ``isolate_name_alt`` columns.
        sheetout: Path of the tab-separated sample sheet to write.

    Raises:
        KeyError: A sample name matches no metadata entry.
        ValueError: A sample name matches more than one metadata entry.
    """
    # Identifiers are read as text so that purely numeric names or names with
    # leading zeros are compared and written back verbatim instead of being
    # coerced to numbers.
    metatbl = pd.read_csv(metadata, sep="\t", index_col=False, dtype=str)
    sample_sheet = pd.read_csv(ssheet, sep="\t", dtype=str)

    isolate_ids = []
    # for each fastq pair, check that there is a corresponding entry in metadata
    for sname in sample_sheet["sample"]:
        selection = metatbl.loc[metatbl["isolate_name_alt"] == sname]
        if selection.empty:
            # no alternative name matches: fall back on the isolate id
            selection = metatbl.loc[metatbl["isolate_id"] == sname]
        if selection.empty:
            # Crash if name not found
            raise KeyError(
                f"There is not information on sample '{sname}' in the metadata table, "
                f"althought valid FASTQs were provided. "
                f"Ensure the completness of the submitted metadata. "
                f"The workflow will expect fastq to be named after either the value "
                f"of the `isolate_name_alt` field if it is filled or the value of "
                f"the `isolate_id` field if `isolate_name_alt` is empty."
            )
        if len(selection) > 1:
            # crash if name not unique
            raise ValueError(
                f"Several entries for sample name '{sname}' were found in the metadata table. "
                f" Make sure that both the fields `isolate_id` and `isolate_name_alt` "
                f"contain unique values"
            )
        isolate_ids.append(selection.iloc[0]["isolate_id"])

    # Output layout: `sample` (isolate id), `fastq_name` (original name),
    # then the remaining sample sheet columns in their original order.
    fastq_names = sample_sheet.pop("sample")
    sample_sheet.insert(0, "fastq_name", fastq_names)
    sample_sheet.insert(0, "sample", isolate_ids)
    sample_sheet.to_csv(sheetout, sep="\t", header=True, index=False)
) # iof QC fail skip sample if sumtbl.at[name, "QC_Vote"] == "FAIL": diff --git a/workflow/scripts/nrls_export.py b/workflow/scripts/nrls_export.py index 8d5e469..5326525 100644 --- a/workflow/scripts/nrls_export.py +++ b/workflow/scripts/nrls_export.py @@ -55,10 +55,12 @@ def main(metadata, ssheet, outdir): for row in tbl.iterrows(): # yields (index, Series) for fastqpath in (row[1]["fq1"], row[1]["fq2"]): filename = os.path.basename(fastqpath) + # rename files with isolate_id + filename_id = filename.replace(row[1]["fastq_name"], row[0]) # copy fastq - shutil.copy(fastqpath, os.path.join(outdir, species, filename)) + shutil.copy(fastqpath, os.path.join(outdir, species, filename_id)) # get checksum - checksums.append(f"{md5(fastqpath)} {filename}") + checksums.append(f"{md5(fastqpath)} {filename_id}") # create a one row df for metadata metanrl.append(pd.DataFrame.from_dict( {row[0]: [ From 05be9676021b3b3f924baa28c5a15e104770b249 Mon Sep 17 00:00:00 2001 From: gregdenay Date: Mon, 23 Oct 2023 15:02:44 +0200 Subject: [PATCH 2/2] linter --- workflow/rules/aquamis.smk | 2 +- workflow/scripts/consolidate_ids.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/rules/aquamis.smk b/workflow/rules/aquamis.smk index 8aeeec3..68c88b7 100644 --- a/workflow/rules/aquamis.smk +++ b/workflow/rules/aquamis.smk @@ -31,7 +31,7 @@ rule consolidate_ids: output: sample_sheet="sample_sheet/samples_isolate_ids.tsv", conda: - "../envs/pandas.yaml", + "../envs/pandas.yaml" log: "logs/consolidate_ids.log", script: diff --git a/workflow/scripts/consolidate_ids.py b/workflow/scripts/consolidate_ids.py index 67757b5..fb9b8f5 100644 --- a/workflow/scripts/consolidate_ids.py +++ b/workflow/scripts/consolidate_ids.py @@ -12,8 +12,6 @@ pass -import os -from pathlib import Path import pandas as pd