
Commit

Merge pull request #4 from gregdenay/main
Relaxed naming for fastq files
gregdenay authored Oct 23, 2023
2 parents 51773d5 + 05be967 commit 6be8480
Showing 13 changed files with 143 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .tests/integration/config/config.yaml
@@ -12,4 +12,4 @@ metadata: data/metadata.tsv
max_threads_sample: 1

# fastq naming scheme for BfR create_sample_sheet script ('illumina', 'ncbi', 'dot', 'flex')
fastq_naming: illumina
fastq_naming: flex
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/data/metadata.tsv
@@ -0,0 +1,3 @@
isolate_id sample_id organism isolate_name_alt isolation_org sequencing_org extraction_method library_method sequencing_instrument bioinformatics_org third_party_flag third_party_owner sample_type collection_date collection_municipality collection_country collection_cause collected_by manufacturer designation manufacturer_type sample_description lot_number
2016-0000962-01 2016-0000962 Listeria monocytogenes unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown
2016-0000962-02 2016-0000962 Listeria monocytogenes 16-LI00962-0 unknown RRW RRW TRUE BfR unknown 01.01.2016 unknown DE lebensmittel unknown unknown unknown unkown
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/data/sample_sheet.tsv
@@ -0,0 +1,3 @@
sample fq1 fq2
2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
3 changes: 3 additions & 0 deletions .tests/unit/consolidate_ids/expected/sample_sheet.tsv
@@ -0,0 +1,3 @@
sample fastq_name fq1 fq2
2016-0000962-01 2016-0000962-01 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/2016-0000962-01_S01_L001_R2_001.fastq.gz
2016-0000962-02 16-LI00962-0 /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R1_001.fastq.gz /home/debian/NGS/NRW-geuebt/local-assembler/.tests/integration/data/16-LI00962-0_S01_L001_R2_001.fastq.gz
35 changes: 35 additions & 0 deletions .tests/unit/test_consolidate_ids.py
@@ -0,0 +1,35 @@
import os
import sys

from tempfile import TemporaryDirectory
import shutil
import filecmp
from pathlib import Path, PurePosixPath


sys.path.insert(0, os.path.dirname(__file__))


def test_consolidate_ids():
    with TemporaryDirectory() as tmpdir:
        # Modify paths to link to your test data and script
        workdir = os.path.join(Path(tmpdir), "workdir")
        data_path = PurePosixPath(".tests/unit/consolidate_ids/data")
        expected_path = PurePosixPath(".tests/unit/consolidate_ids/expected")
        script_path = PurePosixPath(".tests/../workflow/scripts/consolidate_ids.py")

        # Copy data to the temporary workdir.
        shutil.copytree(data_path, workdir)
        shutil.copy(script_path, workdir)

        # run the function under test
        sys.path.insert(0, workdir)
        from consolidate_ids import main  # import main from the copied script
        main(
            ssheet=os.path.join(workdir, "sample_sheet.tsv"),
            metadata=os.path.join(workdir, "metadata.tsv"),
            sheetout=os.path.join(workdir, "result.tsv"),
        )

        # check that the produced and expected tables are the same
        assert filecmp.cmp(
            os.path.join(workdir, "result.tsv"),
            os.path.join(expected_path, "sample_sheet.tsv"),
        )
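The new test follows the layout of Snakemake-style unit tests. Assuming the suite is collected with pytest (an assumption — no runner configuration appears in this diff), it can be run on its own from the repository root with a sketch like the following:

# Hedged sketch: run only the new unit test from the repository root.
# Assumes pytest is installed in the active environment; the exact
# invocation may differ from how the project's CI runs its tests.
import pytest

raise SystemExit(pytest.main(["-v", ".tests/unit/test_consolidate_ids.py"]))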
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
### 0.2.0

The workflow now expects FASTQ files to be named after either the value of the `isolate_name_alt` field, if it is filled,
or the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ file has a name that matches neither field,
the workflow stops with an error.

### 0.1.1

Add missing metadata field "assembly_method" in metadata JSON for geuebt export
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.1.1
0.2.0
8 changes: 6 additions & 2 deletions docs/user_guide.md
@@ -95,17 +95,21 @@ File naming should explicitly follow one of the 4 schemes:
This naming convention is used to generate a sample sheet with the AQUAMIS `create_sampleSheet` helper.
See the documentation directly in the AQUAMIS repository.

Note that the workflow expects FASTQ files to be named after either the value of the `isolate_name_alt` field, if it is filled,
or the value of the `isolate_id` field if `isolate_name_alt` is empty. If a FASTQ file has a name that matches neither field,
the workflow stops with an error.

## Metadata

Metadata should be provided as a table in flat text format, using tab separators.
The table must contain the following fields, not all of which need to be filled:

| Field | Description | Required |
|---|---|---|
| isolate_id | Unique data identification in database | required |
| isolate_id | Unique isolate identification in database | required |
| sample_id | Unique sample identification for epidemiological analysis | required |
| organism | Sample categorization for downstream analysis | required |
| isolate_name_alt | Laboratory internal name | optional |
| isolate_name_alt | Laboratory internal name (unique for each isolate) | optional |
| isolation_org | Contact organisation for isolate | required |
| sequencing_org | Contact organisation for sequencing | required |
| extraction_method | Extraction method name | optional |
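To make the naming rule concrete: in the test fixtures added by this commit, isolate 2016-0000962-02 has `isolate_name_alt` 16-LI00962-0, so its FASTQs are named 16-LI00962-0_*, while 2016-0000962-01 is matched through its `isolate_id`. A minimal sketch of the lookup order follows; `expected_fastq_prefix` is illustrative only and is not a function of the workflow:

# Illustrative sketch of the naming rule, not part of the workflow code.
def expected_fastq_prefix(isolate_id, isolate_name_alt=None):
    # FASTQs must be named after isolate_name_alt when it is filled,
    # otherwise after isolate_id.
    return isolate_name_alt if isolate_name_alt else isolate_id

# Isolates from the test fixtures in this commit:
assert expected_fastq_prefix("2016-0000962-02", "16-LI00962-0") == "16-LI00962-0"
assert expected_fastq_prefix("2016-0000962-01") == "2016-0000962-01"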
17 changes: 16 additions & 1 deletion workflow/rules/aquamis.smk
@@ -23,9 +23,24 @@ rule create_sample_sheet:
        """


checkpoint aquamis:
rule consolidate_ids:
    input:
        sample_sheet="sample_sheet/samples.tsv",
    params:
        metadata=config["metadata"],
    output:
        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
    conda:
        "../envs/pandas.yaml"
    log:
        "logs/consolidate_ids.log",
    script:
        "../scripts/consolidate_ids.py"


checkpoint aquamis:
    input:
        sample_sheet="sample_sheet/samples_isolate_ids.tsv",
    output:
        outdir=directory("aquamis"),
        summary="aquamis/reports/summary_report.tsv",
2 changes: 1 addition & 1 deletion workflow/rules/exports.smk
@@ -38,7 +38,7 @@ rule nrls_export:
    input:
        # using the geuebt table here because it's already QC checked
        metadata="geuebt_export/metadata.tsv",
        ssheet="sample_sheet/samples.tsv",
        ssheet="sample_sheet/samples_isolate_ids.tsv",
    output:
        outdir=directory("nrls_export"),
        flag=touch("nrls_export/sucess.flag"),
61 changes: 61 additions & 0 deletions workflow/scripts/consolidate_ids.py
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import sys


# redirect stderr to the log file when called from a snakemake rule
try:
    sys.stderr = open(snakemake.log[0], "w")
except NameError:
    pass


import pandas as pd


def main(ssheet, metadata, sheetout):
    # load metadata and sample sheet
    metatbl = pd.read_csv(metadata, sep="\t", index_col=False)
    sample_sheet = pd.read_csv(ssheet, sep="\t", index_col="sample")
    new_index = []
    # for each fastq pair, check that there is a corresponding entry in the metadata
    for sname in sample_sheet.index.to_list():
        selection = metatbl.loc[metatbl["isolate_name_alt"] == sname]
        if len(selection) == 0:
            selection = metatbl.loc[metatbl["isolate_id"] == sname]
        if len(selection) == 0:
            # crash if the name is not found
            raise KeyError(
                f"There is no information on sample '{sname}' in the metadata table, "
                f"although valid FASTQs were provided. "
                f"Ensure the completeness of the submitted metadata. "
                f"The workflow expects fastq files to be named after either the value of "
                f"the `isolate_name_alt` field if it is filled, or the value of "
                f"the `isolate_id` field if `isolate_name_alt` is empty."
            )
        elif len(selection) > 1:
            # crash if the name is not unique
            raise ValueError(
                f"Several entries for sample name '{sname}' were found in the metadata table. "
                f"Make sure that both the fields `isolate_id` and `isolate_name_alt` "
                f"contain unique values."
            )
        else:
            # add the isolate id to the new index
            new_index.append(selection.iloc[0]["isolate_id"])

    # reindex: keep the original fastq name as a column and key the sheet on isolate_id
    sample_sheet.reset_index(inplace=True, names="fastq_name")
    reindexed_sheet = sample_sheet.set_index(pd.Index(new_index))
    reindexed_sheet.index.name = "sample"
    # output
    reindexed_sheet.to_csv(sheetout, sep="\t", header=True, index=True)


if __name__ == "__main__":
    main(
        snakemake.input["sample_sheet"],
        snakemake.params["metadata"],
        snakemake.output["sample_sheet"],
    )
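As a sanity check on the reindexing step at the end of main(), here is a minimal, self-contained pandas round-trip; the two-row frame and shortened file names are made up to mirror the test fixtures, and the `names=` argument of reset_index assumes pandas >= 1.5:

import pandas as pd

# Two-row sheet keyed by the name embedded in the FASTQ files,
# mirroring .tests/unit/consolidate_ids/data/sample_sheet.tsv (paths shortened).
sheet = pd.DataFrame(
    {"fq1": ["a_R1.fastq.gz", "b_R1.fastq.gz"],
     "fq2": ["a_R2.fastq.gz", "b_R2.fastq.gz"]},
    index=pd.Index(["2016-0000962-01", "16-LI00962-0"], name="sample"),
)
# Same two steps as in main(): keep the original name as a `fastq_name` column,
# then re-key the table on the isolate_id values resolved from the metadata.
sheet.reset_index(inplace=True, names="fastq_name")  # `names=` needs pandas >= 1.5
reindexed = sheet.set_index(pd.Index(["2016-0000962-01", "2016-0000962-02"]))
reindexed.index.name = "sample"
print(list(reindexed.columns))  # ['fastq_name', 'fq1', 'fq2']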
4 changes: 3 additions & 1 deletion workflow/scripts/geuebt_metadata.py
@@ -46,7 +46,9 @@ def main(assemblies, summary, metadata_in, metadata_out):
        raise KeyError(
            f"There is no information on sample '{name}' in the metadata table, "
            f"although valid FASTQs were provided. "
            f"Ensure the completeness of the submitted metadata."
            f"Ensure the completeness of the submitted metadata. "
            f"The workflow expects fastq files to be named after either the value of "
            f"the `isolate_name_alt` field if it is filled, or the value of "
            f"the `isolate_id` field if `isolate_name_alt` is empty."
        )
        # if QC fails, skip the sample
        if sumtbl.at[name, "QC_Vote"] == "FAIL":
6 changes: 4 additions & 2 deletions workflow/scripts/nrls_export.py
@@ -55,10 +55,12 @@ def main(metadata, ssheet, outdir):
    for row in tbl.iterrows():  # yields (index, Series)
        for fastqpath in (row[1]["fq1"], row[1]["fq2"]):
            filename = os.path.basename(fastqpath)
            # rename the file with the isolate_id
            filename_id = filename.replace(row[1]["fastq_name"], row[0])
            # copy fastq
            shutil.copy(fastqpath, os.path.join(outdir, species, filename))
            shutil.copy(fastqpath, os.path.join(outdir, species, filename_id))
            # get checksum
            checksums.append(f"{md5(fastqpath)} {filename}")
            checksums.append(f"{md5(fastqpath)} {filename_id}")
        # create a one-row df for the metadata
        metanrl.append(pd.DataFrame.from_dict(
            {row[0]: [
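The renaming added to nrls_export.py is a plain substring replacement of the original FASTQ name with the isolate_id taken from the consolidated sample sheet index. A quick sketch with names from the test fixtures in this commit shows the effect (illustrative only, not executed by the workflow):

fastq_name = "16-LI00962-0"     # name embedded in the FASTQ file on disk
isolate_id = "2016-0000962-02"  # index of the consolidated sample sheet
filename = f"{fastq_name}_S01_L001_R1_001.fastq.gz"
filename_id = filename.replace(fastq_name, isolate_id)
assert filename_id == "2016-0000962-02_S01_L001_R1_001.fastq.gz"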
