Merge pull request #65 from gregdenay/master

Fix for `trim_primers_3end` parameter and primer sequences containing ambiguous nucleotides
CVUA-RRW · Aug 18, 2023 · bad6d4f · bad6d4f
2 parents 01c6a0e + 6ec0f9c
commit bad6d4f
Show file tree

Hide file tree

Showing 23 changed files with 273 additions and 151 deletions.
diff --git a/.github/linters/.flake8 b/.github/linters/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 120
+extend-ignore = E203, F821, E402
diff --git a/.github/linters/.markdown-lint.yml b/.github/linters/.markdown-lint.yml
@@ -0,0 +1,36 @@
+---
+###########################
+###########################
+## Markdown Linter rules ##
+###########################
+###########################
+
+# Linter rules doc:
+# - https://github.com/DavidAnson/markdownlint
+#
+# Note:
+# To comment out a single error:
+#   <!-- markdownlint-disable -->
+#   any violations you want
+#   <!-- markdownlint-restore -->
+#
+
+###############
+# Rules by id #
+###############
+MD004: false                  # Unordered list style
+MD007:
+  indent: 2                   # Unordered list indentation
+MD013:
+  line_length: 400            # Line length 80 is far too short
+MD026:
+  punctuation: ".,;:!。，；:"    # List of trailing punctuation not allowed in headings
+MD029: false                  # Ordered list item prefix
+MD033: false                  # Allow inline HTML
+MD036: false                  # Emphasis used instead of a heading
+MD041: false                  # First line in a file should be a top-level heading
+
+#################
+# Rules by tags #
+#################
+blank_lines: false  # Error on blank lines
diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml
@@ -12,34 +12,28 @@ on:
 jobs:
   formatting:
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
+      statuses: write
+
     steps:
 
     - name: Checkout Code
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
 
     - name: Lint
-      uses: github/super-linter@v4
+      uses: github/super-linter@v5
       env:
-        VALIDATE_ALL_CODEBASE: false
+        VALIDATE_ALL_CODEBASE: true
         DEFAULT_BRANCH: master
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         VALIDATE_SNAKEMAKE_SNAKEFMT: true
+        VALIDATE_PYTHON_FLAKE8: true
+        VALIDATE_R: true
 
-# jobs:
-  # formatting:
-    # runs-on: ubuntu-latest
-    # steps:
-    # - uses: actions/checkout@v2
-    # - name: Checkout code
-      # uses: textbook/[email protected]
-    # - name: Formatting
-      # uses: github/super-linter@v4
-      # env:
-        # VALIDATE_ALL_CODEBASE: false
-        # DEFAULT_BRANCH: master
-        # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        # VALIDATE_SNAKEMAKE_SNAKEFMT: true
-
   linting:
     runs-on: ubuntu-latest
     steps:

diff --git a/.tests/data/primers/16S.fa b/.tests/data/primers/16S.fa
@@ -1,8 +1,6 @@
 >FwdM
 GACGAGAAGACCCTATGGAGC
->RevM
-TCCGAGGTCACCCCAACC
 >FwdP
 GACGAGAAGACCCTGTGGAAC
->RevP
-TCCAAGGTCGCCCCAACC
+>Rev
+TCCGAGGTCRCCCCAACC
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+### 1.6.6
+
+#### Fixes
+
+- Corrects parsing of the `trim_primers_3end` parameter (#64)
+- Added a primer disambiguation step that converts primer sequences in the IUPAC ambiguous nucleotide format to their explicit forms (#63)
+
 ### 1.6.5
 
 #### Fixes

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.6.5
+1.6.6
diff --git a/docs/userguide/configuration.md b/docs/userguide/configuration.md
@@ -60,7 +60,7 @@ This will create a file called `samples.tsv` in the `raw_data` folder.
 | `samples`                 | Path                      | Path to the sample sheet                           |
 | `threads_sample`          | Number                    | Number of threads assigned to each job             |
 | `threads`                 | Number                    | Number of threads assigned to the workflow         |
-| `primers_fasta`           | Path                      | Path to the fasta file containing primer sequences |
+| `primers_fasta`           | Path                      | Path to the fasta file containing primer sequences.<br>IUPAC ambiguous nucleotides are accepted.|
 | `blast_DB`                | Path                      | Path to the BLAST database in the form <br>`path/to/folder/db-name` |
 | `taxdb`                   | Path                      | Path to the folder containing the `taxdb` files |
 | `rankedlineage_dmp`       | Path                      | Path to the `rankedlineage.dmp` file from the <br>`taxdump` archive |

diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
@@ -3,4 +3,5 @@ channels:
   - conda-forge
 dependencies:
   - pandas=1.4.2
-  - scikit-learn=1.1.1
+  - scikit-learn=1.1.1
+  - biopython=1.81
diff --git a/workflow/rules/blast.smk b/workflow/rules/blast.smk
@@ -287,7 +287,7 @@ rule collect_blast_stats:
             "reports/blast_stats.tsv",
             caption="../report/blast_stats.rst",
             category="Quality controls",
-        )
+        ),
     message:
         "[All][assignement] aggregating BLAST stats"
     conda:

diff --git a/workflow/rules/common_benchmark.smk b/workflow/rules/common_benchmark.smk
@@ -61,4 +61,4 @@ def get_acc_blocklist(wildcards):
     if config["seq_blocklist"] == "None":
         return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report.tsv"
     else:
-        return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report_prefiltered.tsv"
+        return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report_prefiltered.tsv"
diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk
@@ -4,11 +4,24 @@ shell.executable("bash")
 # Rules primers trimming
 
 
-rule get_primer_revcomp:
+rule primer_disambiguation:
     output:
-        primers_rc=temp("common/primer_revcomp.fa"),
+        primers=temp("common/primer_explicit.fa"),
     params:
         primers=config["primers_fasta"],
+    conda:
+        "../envs/pandas.yaml"
+    log:
+        "logs/primer_disambiguation.log",
+    script:
+        "../scripts/primer_disambiguation.py"
+
+
+rule get_primer_revcomp:
+    input:
+        primers="common/primer_explicit.fa",
+    output:
+        primers_rc=temp("common/primer_revcomp.fa"),
     message:
         "[Common][trimming] reverse-complementing primers"
     conda:
@@ -17,7 +30,7 @@ rule get_primer_revcomp:
         "logs/common/primer_revcomp.log",
     shell:
         """
-        seqtk seq -r {params.primers} 1> {output.primers_rc} 2> {log}
+        seqtk seq -r {input.primers} 1> {output.primers_rc} 2> {log}
         """
 
 
@@ -46,7 +59,7 @@ rule cutadapt:
     shell:
         """
         # Simple case only 5p trimming
-        if [[ {params.primer_3p} == false ]]
+        if [[ {params.primer_3p} == False ]]
         then
             cutadapt {input.r1} \
                 {input.r2} \

diff --git a/workflow/scripts/append_params.py b/workflow/scripts/append_params.py
@@ -15,17 +15,17 @@ def main(
     benchmarkin, confmatin, yieldsin, metricsin, pr_curvein,
     benchmarkout, confmatout, yieldsout, metricsout, pr_curveout,
     pspace,
-    ):
-    
+):
+
     for fin, fout in [
         (benchmarkin, benchmarkout),
         (confmatin, confmatout),
         (yieldsin, yieldsout),
         (metricsin, metricsout),
         (pr_curvein, pr_curveout),
-        ]:
-        tbl=pd.read_csv(fin, sep="\t")
-        for k,v in pspace.items():
+    ]:
+        tbl = pd.read_csv(fin, sep="\t")
+        for k, v in pspace.items():
             tbl[k] = v
         tbl.to_csv(fout, sep="\t", header=True, index=False)
 

diff --git a/workflow/scripts/benchmark_metrics.py b/workflow/scripts/benchmark_metrics.py
@@ -11,32 +11,32 @@
 import numpy as np
 import pandas as pd
 from sklearn.metrics import (
-    precision_score, 
-    recall_score, 
-    f1_score, 
+    precision_score,
+    recall_score,
+    f1_score,
     average_precision_score,
 )
 
 
 def main(confmat, output, sample):
     conf_table = pd.read_csv(confmat, sep="\t")
-    
+
     # get classification metrics
     precision = precision_score(conf_table['expected'], conf_table['predicted'])
     recall = recall_score(conf_table['expected'], conf_table['predicted'])
     fscore = f1_score(conf_table['expected'], conf_table['predicted'])
     prauc = average_precision_score(conf_table['expected'], conf_table['pred_ratio'])
-    
+
     # Get quantification metrics only on expected Taxids!
     exponly = conf_table.loc[conf_table['expected'] == 1]
     l2dist = np.linalg.norm(np.array(exponly['pred_ratio'])-np.array(exponly['exp_ratio']))
     # error = mean(abs(pred - exp))
     err = np.mean(np.absolute(np.array(exponly['pred_ratio'])-np.array(exponly['exp_ratio'])))
-    
+
     with open(output, "w") as fout:
         fout.write("\t".join(["Sample", "Precision", "Recall", "F1 score", "Average precision", "Distance", "Error"]))
         fout.write("\n")
-        fout.write("\t".join([sample, str(precision), str(recall), str(fscore), str(prauc), str(l2dist),str(err)]))
+        fout.write("\t".join([sample, str(precision), str(recall), str(fscore), str(prauc), str(l2dist), str(err)]))
         fout.write("\n")
 
 
@@ -45,4 +45,4 @@ def main(confmat, output, sample):
         confmat=snakemake.input['confmat'],
         output=snakemake.output['metrics'],
         sample=snakemake.params['sample'],
-    )
+    )
diff --git a/workflow/scripts/conda_collector.py b/workflow/scripts/conda_collector.py
@@ -18,19 +18,19 @@ def extract_package_version(envfile):
         env = yaml.safe_load(stream)
         for dep in env['dependencies']:
             p, v = dep.split("=")
-            yield p,v
+            yield p, v
 
 
 def main(report, basedir):
     mypath = os.path.join(basedir, "envs")
     envs = [
-        os.path.join(mypath, f) for f in os.listdir(mypath) 
+        os.path.join(mypath, f) for f in os.listdir(mypath)
         if os.path.isfile(os.path.join(mypath, f)) and f.lower().endswith(('.yaml', '.yml'))
     ]
     df = []
     for ef in envs:
-        for p,v in extract_package_version(ef):
-            df.append({'Package': p, 'Version':v})
+        for p, v in extract_package_version(ef):
+            df.append({'Package': p, 'Version': v})
     df = pd.DataFrame(df)
     df.sort_values('Package').to_csv(report, sep="\t", header=True, index=False)
 

diff --git a/workflow/scripts/config_writer.py b/workflow/scripts/config_writer.py
@@ -9,7 +9,6 @@
 
 
 import yaml
-import pandas as pd
 
 
 def main(default_config_file, params, output):
@@ -22,7 +21,7 @@ def main(default_config_file, params, output):
 
     config.update(params)
 
-    dump = "\n".join([f"{k}: {v}" for k,v in config.items()])
+    dump = "\n".join([f"{k}: {v}" for k, v in config.items()])
     with open(output, 'w') as stream:
         stream.write(dump)
 
@@ -33,5 +32,3 @@ def main(default_config_file, params, output):
         params=snakemake.params['pspace'],
         output=snakemake.output['conf'],
     )
-
-