
Commit

Merge pull request #4 from gregdenay/main
Relaxed naming for fastq files
gregdenay authored Oct 23, 2023
2 parents 51773d5 + 05be967 commit 6be8480
Showing 13 changed files with 143 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .tests/integration/config/config.yaml
@@ -12,4 +12,4 @@ metadata: data/metadata.tsv
max_threads_sample: 1

# fastq naming scheme for BfR create_sample_sheet script ('illumina', 'ncbi', 'dot', 'flex')
fastq_naming: illumina
fastq_naming: flex
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/data/metadata.tsv
@@ -0,0 +1,3 @@
isolate_id sample_id organism isolate_name_alt isolation_org sequencing_org extraction_method library_method sequencing_instrument bioinformatics_org third_party_flag third_party_owner sample_type collection_date collection_municipality collection_country collection_cause collected_by manufacturer designation manufacturer_type sample_description lot_number
2016-0000962-01 2016-0000962 Listeria monocytogenes unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown
2016-0000962-02 2016-0000962 Listeria monocytogenes 16-LI00962-0 unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/data/sample_sheet.tsv
@@ -0,0 +1,3 @@
sample fq1 fq2
2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/expected/sample_sheet.tsv
@@ -0,0 +1,3 @@
sample fastq_name fq1 fq2
2016-0000962-01 2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
2016-0000962-02 16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
35 changes: 35 additions & 0 deletions .tests/unit/test_consolidate_ids.py
@@ -0,0 +1,35 @@
import os
import sys

from tempfile import TemporaryDirectory
import shutil
import filecmp
from pathlib import Path, PurePosixPath


sys.path.insert(0, os.path.dirname(__file__))


def test_consolidate_ids():
    with TemporaryDirectory() as tmpdir:
        # Modify paths to link to your test data and script
        workdir = os.path.join(Path(tmpdir), "workdir")
        data_path = PurePosixPath(".tests/unit/consolidate_ids/data")
        expected_path = PurePosixPath(".tests/unit/consolidate_ids/expected")
        script_path = PurePosixPath(".tests/../workflow/scripts/consolidate_ids.py")

        # Copy data to the temporary workdir.
        shutil.copytree(data_path, workdir)
        shutil.copy(script_path, workdir)

        # run the function under test
        sys.path.insert(0, workdir)
        from consolidate_ids import main  # import main from the copied script
        main(
            ssheet=os.path.join(workdir, "sample_sheet.tsv"),
            metadata=os.path.join(workdir, "metadata.tsv"),
            sheetout=os.path.join(workdir, "result.tsv"),
        )

        # check that the produced and expected tables are the same
        assert filecmp.cmp(
            os.path.join(workdir, "result.tsv"),
            os.path.join(expected_path, "sample_sheet.tsv"),
        )
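The new test follows the layout of Snakemake-style unit tests. Assuming the suite is collected with pytest (an assumption — no runner configuration appears in this diff), it can be run on its own from the repository root with a sketch like the following:

# Hedged sketch: run only the new unit test from the repository root.
# Assumes pytest is installed in the active environment; the exact
# invocation may differ from how the project's CI runs its tests.
import pytest

raise SystemExit(pytest.main(["-v", ".tests/unit/test_consolidate_ids.py"]))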
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
### 0.2.0

The workflow now expects FASTQ files to be named after either the value of the `isolate_name_alt` field, if it is filled,
or the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ file has a name that matches neither field,
the workflow stops with an error.

### 0.1.1

Add missing metadata field "assembly_method" in metadata JSON for geuebt export
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.1.1
0.2.0
8 changes: 6 additions & 2 deletions docs/user_guide.md
@@ -95,17 +95,21 @@ File naming should explicitly follow one of the 4 schemes:
This naming convention is used to generate a sample sheet with the AQUAMIS `create_sampleSheet` helper.
See the documentation directly in the AQUAMIS repository.

Note that the workflow expects FASTQ files to be named after either the value of the `isolate_name_alt` field, if it is filled,
or the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ file has a name that matches neither field,
the workflow stops with an error.

## Metadata

Metadata should be provided as a table in flat text format, using tab separators.
The table must contain the following fields, not all of which need to be filled:

| Field | Description | Required |
|---|---|---|
| isolate_id | Unique data identification in database | required |
| isolate_id | Unique isolate identification in database | required |
| sample_id | Unique sample identification for epidemiological analysis | required |
| organism | Sample categorization for downstream analysis | required |
| isolate_name_alt | Laboratory internal name | optional |
| isolate_name_alt | Laboratory internal name (unique for each isolate) | optional |
| isolation_org | Contact organisation for isolate | required |
| sequencing_org | Contact organisation for sequencing | required |
| extraction_method | Extraction method name | optional |
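To make the naming rule concrete: in the test fixtures added by this commit, isolate 2016-0000962-02 has `isolate_name_alt` 16-LI00962-0, so its FASTQs are named 16-LI00962-0_*, while 2016-0000962-01 is matched through its `isolate_id`. A minimal sketch of the lookup order follows; `expected_fastq_prefix` is illustrative only and is not a function of the workflow:

# Illustrative sketch of the naming rule, not part of the workflow code.
def expected_fastq_prefix(isolate_id, isolate_name_alt=None):
    # FASTQs must be named after isolate_name_alt when it is filled,
    # otherwise after isolate_id.
    return isolate_name_alt if isolate_name_alt else isolate_id

# Isolates from the test fixtures in this commit:
assert expected_fastq_prefix("2016-0000962-02", "16-LI00962-0") == "16-LI00962-0"
assert expected_fastq_prefix("2016-0000962-01") == "2016-0000962-01"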
17 changes: 16 additions & 1 deletion workflow/rules/aquamis.smk
@@ -23,9 +23,24 @@ rule create_sample_sheet:
        """


checkpoint aquamis:
rule consolidate_ids:
    input:
        sample_sheet="sample_sheet/samples.tsv",
    params:
        metadata=config["metadata"],
    output:
        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
    conda:
        "../envs/pandas.yaml"
    log:
        "logs/consolidate_ids.log",
    script:
        "../scripts/consolidate_ids.py"


checkpoint aquamis:
    input:
        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
    output:
        outdir=directory("aquamis"),
        summary="aquamis/reports/summary_report.tsv",
2 changes: 1 addition & 1 deletion workflow/rules/exports.smk
@@ -38,7 +38,7 @@ rule nrls_export:
    input:
        # using the geuebt table here because it's already QC checked
        metadata="geuebt_export/metadata.tsv",
        ssheet="sample_sheet/samples.tsv",
        ssheet="sample_sheet/samples_isolate_ids.tsv",
    output:
        outdir=directory("nrls_export"),
        flag=touch("nrls_export/sucess.flag"),
61 changes: 61 additions & 0 deletions workflow/scripts/consolidate_ids.py
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import sys


# redirect stderr to the log file when called from a snakemake rule
try:
    sys.stderr = open(snakemake.log[0], "w")
except NameError:
    pass


import pandas as pd


def main(ssheet, metadata, sheetout):
    # load metadata and sample sheet
    metatbl = pd.read_csv(metadata, sep="\t", index_col=False)
    sample_sheet = pd.read_csv(ssheet, sep="\t", index_col="sample")
    new_index = []
    # for each fastq pair, check that there is a corresponding entry in the metadata
    for sname in sample_sheet.index.to_list():
        selection = metatbl.loc[metatbl["isolate_name_alt"] == sname]
        if len(selection) == 0:
            selection = metatbl.loc[metatbl["isolate_id"] == sname]
        if len(selection) == 0:
            # crash if the name is not found
            raise KeyError(
                f"There is no information on sample '{sname}' in the metadata table, "
                f"although valid FASTQs were provided. "
                f"Ensure the completeness of the submitted metadata. "
                f"The workflow expects fastq files to be named after either the value of "
                f"the `isolate_name_alt` field if it is filled, or the value of "
                f"the `isolate_id` field if `isolate_name_alt` is empty."
            )
        elif len(selection) > 1:
            # crash if the name is not unique
            raise ValueError(
                f"Several entries for sample name '{sname}' were found in the metadata table. "
                f"Make sure that both the fields `isolate_id` and `isolate_name_alt` "
                f"contain unique values."
            )
        else:
            # add the isolate id to the new index
            new_index.append(selection.iloc[0]["isolate_id"])

    # reindex: keep the original fastq name as a column and key the sheet on isolate_id
    sample_sheet.reset_index(inplace=True, names="fastq_name")
    reindexed_sheet = sample_sheet.set_index(pd.Index(new_index))
    reindexed_sheet.index.name = "sample"
    # output
    reindexed_sheet.to_csv(sheetout, sep="\t", header=True, index=True)


if __name__ == "__main__":
    main(
        snakemake.input["sample_sheet"],
        snakemake.params["metadata"],
        snakemake.output["sample_sheet"],
    )
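As a sanity check on the reindexing step at the end of main(), here is a minimal, self-contained pandas round-trip; the two-row frame and shortened file names are made up to mirror the test fixtures, and the `names=` argument of reset_index assumes pandas >= 1.5:

import pandas as pd

# Two-row sheet keyed by the name embedded in the FASTQ files,
# mirroring .tests/unit/consolidate_ids/data/sample_sheet.tsv (paths shortened).
sheet = pd.DataFrame(
    {"fq1": ["a_R1.fastq.gz", "b_R1.fastq.gz"],
     "fq2": ["a_R2.fastq.gz", "b_R2.fastq.gz"]},
    index=pd.Index(["2016-0000962-01", "16-LI00962-0"], name="sample"),
)
# Same two steps as in main(): keep the original name as a `fastq_name` column,
# then re-key the table on the isolate_id values resolved from the metadata.
sheet.reset_index(inplace=True, names="fastq_name")  # `names=` needs pandas >= 1.5
reindexed = sheet.set_index(pd.Index(["2016-0000962-01", "2016-0000962-02"]))
reindexed.index.name = "sample"
print(list(reindexed.columns))  # ['fastq_name', 'fq1', 'fq2']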
4 changes: 3 additions & 1 deletion workflow/scripts/geuebt_metadata.py
@@ -46,7 +46,9 @@ def main(assemblies, summary, metadata_in, metadata_out):
        raise KeyError(
            f"There is no information on sample '{name}' in the metadata table, "
            f"although valid FASTQs were provided. "
            f"Ensure the completeness of the submitted metadata."
            f"Ensure the completeness of the submitted metadata. "
            f"The workflow expects fastq files to be named after either the value of "
            f"the `isolate_name_alt` field if it is filled, or the value of "
            f"the `isolate_id` field if `isolate_name_alt` is empty."
        )
        # if QC fails, skip the sample
        if sumtbl.at[name, "QC_Vote"] == "FAIL":
6 changes: 4 additions & 2 deletions workflow/scripts/nrls_export.py
@@ -55,10 +55,12 @@ def main(metadata, ssheet, outdir):
    for row in tbl.iterrows():  # yields (index, Series)
        for fastqpath in (row[1]["fq1"], row[1]["fq2"]):
            filename = os.path.basename(fastqpath)
            # rename the file with the isolate_id
            filename_id = filename.replace(row[1]["fastq_name"], row[0])
            # copy fastq
            shutil.copy(fastqpath, os.path.join(outdir, species, filename))
            shutil.copy(fastqpath, os.path.join(outdir, species, filename_id))
            # get checksum
            checksums.append(f"{md5(fastqpath)} {filename}")
            checksums.append(f"{md5(fastqpath)} {filename_id}")
        # create a one-row df for the metadata
        metanrl.append(pd.DataFrame.from_dict(
            {row[0]: [
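The renaming added to nrls_export.py is a plain substring replacement of the original FASTQ name with the isolate_id taken from the consolidated sample sheet index. A quick sketch with names from the test fixtures in this commit shows the effect (illustrative only, not executed by the workflow):

fastq_name = "16-LI00962-0"     # name embedded in the FASTQ file on disk
isolate_id = "2016-0000962-02"  # index of the consolidated sample sheet
filename = f"{fastq_name}_S01_L001_R1_001.fastq.gz"
filename_id = filename.replace(fastq_name, isolate_id)
assert filename_id == "2016-0000962-02_S01_L001_R1_001.fastq.gz"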
