From bc269b745999cc478b58c66615293a0efa17a595 Mon Sep 17 00:00:00 2001
From: gregdenay <gregoire.denay@cvua-rrw.de>
Date: Mon, 23 Oct 2023 14:55:40 +0200
Subject: [PATCH 1/2] Will accept fastq using the isolate_name_alt naming

---
 .tests/integration/config/config.yaml         |  2 +-
 .tests/unit/consolidate_ids/data/metadata.tsv |  3 +
 .../consolidate_ids/data/sample_sheet.tsv     |  3 +
 .../consolidate_ids/expected/sample_sheet.tsv |  3 +
 .tests/unit/test_consolidate_ids.py           | 35 +++++++++++
 CHANGELOG.md                                  |  6 ++
 VERSION                                       |  2 +-
 docs/user_guide.md                            |  8 ++-
 workflow/rules/aquamis.smk                    | 17 ++++-
 workflow/rules/exports.smk                    |  2 +-
 workflow/scripts/consolidate_ids.py           | 63 +++++++++++++++++++
 workflow/scripts/geuebt_metadata.py           |  4 +-
 workflow/scripts/nrls_export.py               |  6 +-
 13 files changed, 145 insertions(+), 9 deletions(-)
 create mode 100644 .tests/unit/consolidate_ids/data/metadata.tsv
 create mode 100644 .tests/unit/consolidate_ids/data/sample_sheet.tsv
 create mode 100644 .tests/unit/consolidate_ids/expected/sample_sheet.tsv
 create mode 100644 .tests/unit/test_consolidate_ids.py
 create mode 100644 workflow/scripts/consolidate_ids.py

diff --git a/.tests/integration/config/config.yaml b/.tests/integration/config/config.yaml
index a796bd1..732964a 100644
--- a/.tests/integration/config/config.yaml
+++ b/.tests/integration/config/config.yaml
@@ -12,4 +12,4 @@ metadata: data/metadata.tsv
 max_threads_sample: 1
 
 # fastq naming scheme for BfR create_sample_sheet script ('illumina', 'ncbi', 'dot', 'flex')
-fastq_naming: illumina
+fastq_naming: flex
diff --git a/.tests/unit/consolidate_ids/data/metadata.tsv b/.tests/unit/consolidate_ids/data/metadata.tsv
new file mode 100644
index 0000000..049e5b9
--- /dev/null
+++ b/.tests/unit/consolidate_ids/data/metadata.tsv
@@ -0,0 +1,3 @@
+isolate_id	sample_id	organism	isolate_name_alt	isolation_org	sequencing_org	extraction_method	library_method	sequencing_instrument	bioinformatics_org	third_party_flag	third_party_owner	sample_type	collection_date	collection_municipality	collection_country	collection_cause	collected_by	manufacturer	designation	manufacturer_type	sample_description	lot_number
+2016-0000962-01	2016-0000962	Listeria monocytogenes		unknown	RRW				RRW	TRUE	BfR	unknown	01.01.2016	unknown	DE	lebensmittel	unknown		unknown	unknown	unkown	
+2016-0000962-02	2016-0000962	Listeria monocytogenes	16-LI00962-0	unknown	RRW				RRW	TRUE	BfR	unknown	01.01.2016	unknown	DE	lebensmittel	unknown		unknown	unknown	unkown	
diff --git a/.tests/unit/consolidate_ids/data/sample_sheet.tsv b/.tests/unit/consolidate_ids/data/sample_sheet.tsv
new file mode 100644
index 0000000..a523e6e
--- /dev/null
+++ b/.tests/unit/consolidate_ids/data/sample_sheet.tsv
@@ -0,0 +1,3 @@
+sample	fq1	fq2
+2016-0000962-01	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
+16-LI00962-0	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
diff --git a/.tests/unit/consolidate_ids/expected/sample_sheet.tsv b/.tests/unit/consolidate_ids/expected/sample_sheet.tsv
new file mode 100644
index 0000000..e16a94f
--- /dev/null
+++ b/.tests/unit/consolidate_ids/expected/sample_sheet.tsv
@@ -0,0 +1,3 @@
+sample	fastq_name	fq1	fq2
+2016-0000962-01	2016-0000962-01	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
+2016-0000962-02	16-LI00962-0	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz	/home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
diff --git a/.tests/unit/test_consolidate_ids.py b/.tests/unit/test_consolidate_ids.py
new file mode 100644
index 0000000..628128c
--- /dev/null
+++ b/.tests/unit/test_consolidate_ids.py
@@ -0,0 +1,35 @@
+import os
+import sys
+
+from tempfile import TemporaryDirectory
+import shutil
+import filecmp
+from pathlib import Path, PurePosixPath
+
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+
+def test_consolidate_ids():
+    """Run `consolidate_ids.main` on the fixture tables and compare with the expected sheet."""
+    with TemporaryDirectory() as tmpdir:
+        workdir = os.path.join(Path(tmpdir), "workdir")
+        data_path = PurePosixPath(".tests/unit/consolidate_ids/data")
+        expected_path = PurePosixPath(".tests/unit/consolidate_ids/expected")
+        script_path = PurePosixPath(".tests/../workflow/scripts/consolidate_ids.py")
+        # Copy data and script to the temporary workdir.
+        shutil.copytree(data_path, workdir)
+        shutil.copy(script_path, workdir)
+
+        # import `main` from the copied script and run it on the copied tables
+        sys.path.insert(0, workdir)
+        from consolidate_ids import main
+        main(
+            ssheet=os.path.join(workdir, "sample_sheet.tsv"),
+            metadata=os.path.join(workdir, "metadata.tsv"),
+            sheetout=os.path.join(workdir, 'result.tsv'),
+        )
+
+        # shallow=False compares file contents instead of trusting os.stat() signatures
+        expected = os.path.join(expected_path, "sample_sheet.tsv")
+        assert filecmp.cmp(os.path.join(workdir, "result.tsv"), expected, shallow=False)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3d140e8..466d50b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+### 0.2.0
+
+The workflow now expects FASTQ files to be named after the value of the `isolate_name_alt` field if it is filled,
+or after the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ name matches neither field,
+the workflow stops with an error.
+
 ### 0.1.1
 
 Add missing metadata field "assembly_method" in metadata JSON for geuebt export
diff --git a/VERSION b/VERSION
index 6da28dd..341cf11 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.1.1
\ No newline at end of file
+0.2.0
\ No newline at end of file
diff --git a/docs/user_guide.md b/docs/user_guide.md
index 107ba37..8e7a832 100644
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@@ -95,6 +95,10 @@ File naming should explicitely follow one the 4 schemes:
 This naming convention is used to generate a sample sheet with the AQUAMIS `create_sampleSheet` helper.
 See the documentation directly in the AQUAMIS repository.
 
+Note that the workflow expects FASTQ files to be named after the value of the `isolate_name_alt` field if it is filled,
+or after the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ name matches neither field,
+the workflow stops with an error.
+
 ## Metadata
 
 Metadata hould be provided as a table ina a flat text format, using tab separators.
@@ -102,10 +106,10 @@ The table must contain following fields, not all of which must be filled:
 
 | Field  | Description | Required |
 |---|---|---|
-| isolate_id | Unique data identification in database | required |
+| isolate_id | Unique isolate identification in database | required |
 | sample_id | Unique sample identification for epidemiological analysis | required |
 | organims | Sample categorization for downstream analysis | required |
-| isolate_name_alt | Laboratory internal name | optional |
+| isolate_name_alt | Laboratory internal name (unique for each isolate) | optional |
 | isolation_org | Contact organisation for isolate | required |
 | sequencing_org | Contact organisation for sequencing | required |
 | extraction_method | Extraction method name | optional |
diff --git a/workflow/rules/aquamis.smk b/workflow/rules/aquamis.smk
index 0794f62..8aeeec3 100644
--- a/workflow/rules/aquamis.smk
+++ b/workflow/rules/aquamis.smk
@@ -23,9 +23,24 @@ rule create_sample_sheet:
         """
 
 
-checkpoint aquamis:
+rule consolidate_ids:
     input:
         sample_sheet="sample_sheet/samples.tsv",
+    params:
+        metadata=config["metadata"],
+    output:
+        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
+    conda:
+        "../envs/pandas.yaml",
+    log:
+        "logs/consolidate_ids.log",
+    script:
+        "../scripts/consolidate_ids.py"
+
+
+checkpoint aquamis:
+    input:
+        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
     output:
         outdir=directory("aquamis"),
         summary="aquamis/reports/summary_report.tsv",
diff --git a/workflow/rules/exports.smk b/workflow/rules/exports.smk
index 3aa8df5..fd4d542 100644
--- a/workflow/rules/exports.smk
+++ b/workflow/rules/exports.smk
@@ -38,7 +38,7 @@ rule nrls_export:
     input:
         # using geuebt table here becaus it's already qc checked
         metadata="geuebt_export/metadata.tsv",
-        ssheet="sample_sheet/samples.tsv",
+        ssheet="sample_sheet/samples_isolate_ids.tsv",
     output:
         outdir=directory("nrls_export"),
         flag=touch("nrls_export/sucess.flag"),
diff --git a/workflow/scripts/consolidate_ids.py b/workflow/scripts/consolidate_ids.py
new file mode 100644
index 0000000..67757b5
--- /dev/null
+++ b/workflow/scripts/consolidate_ids.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+import sys
+
+
+# send stderr to the rule's log file; outside Snakemake `snakemake` is undefined
+try:
+    sys.stderr = open(snakemake.log[0], "w")
+except NameError:
+    pass
+
+
+import os
+from pathlib import Path
+import pandas as pd
+
+
+def main(ssheet, metadata, sheetout):
+    """Re-index the sample sheet on `isolate_id`, keeping FASTQ names as `fastq_name`.
+
+    Each FASTQ name is looked up in `isolate_name_alt` first, then in `isolate_id`.
+    Raises KeyError if a name has no metadata entry, ValueError if it has several.
+    """
+    metatbl = pd.read_csv(metadata, sep="\t", index_col=False)
+    sample_sheet = pd.read_csv(ssheet, sep="\t", index_col="sample")
+    new_index = []
+    for sname in sample_sheet.index.to_list():
+        selection = metatbl.loc[metatbl["isolate_name_alt"] == sname]
+        if len(selection) == 0:
+            selection = metatbl.loc[metatbl["isolate_id"] == sname]
+        if len(selection) == 0:
+            raise KeyError(
+                f"There is not information on sample '{sname}' in the metadata table, "
+                f"althought valid FASTQs were provided. "
+                f"Ensure the completness of the submitted metadata. "
+                f"The workflow will expect fastq to be named after either the value "
+                f"of the `isolate_name_alt` field if it is filled or the value of "
+                f"the `isolate_id` field if `isolate_name_alt` is empty."
+            )
+        elif len(selection) > 1:
+            raise ValueError(
+                f"Several entries for sample name '{sname}' were found in the metadata table. "
+                f" Make sure that both the fields `isolate_id` and `isolate_name_alt` "
+                f"contain unique values"
+            )
+        else:
+            new_index.append(selection.iloc[0]["isolate_id"])
+
+    # move the FASTQ names to a `fastq_name` column, then index rows on the isolate ids
+    sample_sheet.reset_index(inplace=True, names="fastq_name")
+    reindexed_sheet = sample_sheet.set_index(pd.Index(new_index))
+    reindexed_sheet.index.name = "sample"
+    reindexed_sheet.to_csv(sheetout, sep="\t", header=True, index=True)
+
+
+if __name__ == "__main__":
+    main(
+        snakemake.input["sample_sheet"],
+        snakemake.params["metadata"],
+        snakemake.output["sample_sheet"],
+    )
diff --git a/workflow/scripts/geuebt_metadata.py b/workflow/scripts/geuebt_metadata.py
index f1f0031..283f8e1 100644
--- a/workflow/scripts/geuebt_metadata.py
+++ b/workflow/scripts/geuebt_metadata.py
@@ -46,7 +46,9 @@ def main(assemblies, summary, metadata_in, metadata_out):
             raise KeyError(
                 f"There is not information on sample '{name}' in the metadata table, "
                 f"althought valid FASTQs were provided. "
-                f"Ensure the completness of the submitted metadata."
+                f"Ensure the completness of the submitted metadata. "
+                f"The workflow will expect fastq to be named after either the value "
+                f"the `isolate_id` field if `isolate_name_alt` is empty."
             )
         # iof QC fail skip sample
         if sumtbl.at[name, "QC_Vote"] == "FAIL":
diff --git a/workflow/scripts/nrls_export.py b/workflow/scripts/nrls_export.py
index 8d5e469..5326525 100644
--- a/workflow/scripts/nrls_export.py
+++ b/workflow/scripts/nrls_export.py
@@ -55,10 +55,12 @@ def main(metadata, ssheet, outdir):
         for row in tbl.iterrows():  # yields (index, Series)
             for fastqpath in (row[1]["fq1"], row[1]["fq2"]):
                 filename = os.path.basename(fastqpath)
+                # rename files with isolate_id
+                filename_id = filename.replace(row[1]["fastq_name"], row[0])
                 # copy fastq
-                shutil.copy(fastqpath, os.path.join(outdir, species, filename))
+                shutil.copy(fastqpath, os.path.join(outdir, species, filename_id))
                 # get checksum
-                checksums.append(f"{md5(fastqpath)}  {filename}")
+                checksums.append(f"{md5(fastqpath)}  {filename_id}")
             # create a one row df for metadata
             metanrl.append(pd.DataFrame.from_dict(
                 {row[0]: [

From 05be9676021b3b3f924baa28c5a15e104770b249 Mon Sep 17 00:00:00 2001
From: gregdenay <gregoire.denay@cvua-rrw.de>
Date: Mon, 23 Oct 2023 15:02:44 +0200
Subject: [PATCH 2/2] linter

---
 workflow/rules/aquamis.smk          | 2 +-
 workflow/scripts/consolidate_ids.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/workflow/rules/aquamis.smk b/workflow/rules/aquamis.smk
index 8aeeec3..68c88b7 100644
--- a/workflow/rules/aquamis.smk
+++ b/workflow/rules/aquamis.smk
@@ -31,7 +31,7 @@ rule consolidate_ids:
     output:
         sample_sheet="sample_sheet/samples_isolate_ids.tsv",
     conda:
-        "../envs/pandas.yaml",
+        "../envs/pandas.yaml"
     log:
         "logs/consolidate_ids.log",
     script:
diff --git a/workflow/scripts/consolidate_ids.py b/workflow/scripts/consolidate_ids.py
index 67757b5..fb9b8f5 100644
--- a/workflow/scripts/consolidate_ids.py
+++ b/workflow/scripts/consolidate_ids.py
@@ -12,8 +12,6 @@
     pass
 
 
-import os
-from pathlib import Path
 import pandas as pd