Skip to content

Commit

Permalink
Merge pull request #65 from gregdenay/master
Browse files Browse the repository at this point in the history
Fix for `trim_primers_3end` parameter and primer sequences containing ambiguous nucleotides
  • Loading branch information
gregdenay authored Aug 18, 2023
2 parents 01c6a0e + 6ec0f9c commit bad6d4f
Show file tree
Hide file tree
Showing 23 changed files with 273 additions and 151 deletions.
3 changes: 3 additions & 0 deletions .github/linters/.flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 120
extend-ignore = E203, F821, E402
36 changes: 36 additions & 0 deletions .github/linters/.markdown-lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
###########################
###########################
## Markdown Linter rules ##
###########################
###########################

# Linter rules doc:
# - https://github.com/DavidAnson/markdownlint
#
# Note:
# To comment out a single error:
# <!-- markdownlint-disable -->
# any violations you want
# <!-- markdownlint-restore -->
#

###############
# Rules by id #
###############
MD004: false # Unordered list style
MD007:
indent: 2 # Unordered list indentation
MD013:
line_length: 400 # Line length 80 is far too short
MD026:
punctuation: ".,;:!。,;:" # List of not allowed
MD029: false # Ordered list item prefix
MD033: false # Allow inline HTML
MD036: false # Emphasis used instead of a heading
MD041: false # First line in a file should be a top-level heading

#################
# Rules by tags #
#################
blank_lines: false # Error on blank lines
30 changes: 12 additions & 18 deletions .github/workflows/linter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,34 +12,28 @@ on:
jobs:
formatting:
runs-on: ubuntu-latest
permissions:
contents: read
packages: read
statuses: write

steps:

- name: Checkout Code
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Lint
uses: github/super-linter@v4
uses: github/super-linter@v5
env:
VALIDATE_ALL_CODEBASE: false
VALIDATE_ALL_CODEBASE: true
DEFAULT_BRANCH: master
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
VALIDATE_SNAKEMAKE_SNAKEFMT: true
VALIDATE_PYTHON_FLAKE8: true
VALIDATE_R: true

# jobs:
# formatting:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - name: Checkout code
# uses: textbook/[email protected]
# - name: Formatting
# uses: github/super-linter@v4
# env:
# VALIDATE_ALL_CODEBASE: false
# DEFAULT_BRANCH: master
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# VALIDATE_SNAKEMAKE_SNAKEFMT: true

linting:
runs-on: ubuntu-latest
steps:
Expand Down
6 changes: 2 additions & 4 deletions .tests/data/primers/16S.fa
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
>FwdM
GACGAGAAGACCCTATGGAGC
>RevM
TCCGAGGTCACCCCAACC
>FwdP
GACGAGAAGACCCTGTGGAAC
>RevP
TCCAAGGTCGCCCCAACC
>Rev
TCCGAGGTCRCCCCAACC
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
### 1.6.6

#### Fixes

- Corrects parsing of the `trim_primers_3end` parameter (#64)
- Added a primer disambiguation step that converts primer sequences in the IUPAC ambiguous nucleotide format to their explicit forms (#63)

### 1.6.5

#### Fixes
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.6.5
1.6.6
2 changes: 1 addition & 1 deletion docs/userguide/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ This will create a file called `samples.tsv` in the `raw_data` folder.
| `samples` | Path | Path to the sample sheet |
| `threads_sample` | Number | Number of threads assigned to each job |
| `threads` | Number | Number of threads assigned to the workflow |
| `primers_fasta` | Path | Path to the fasta file containing primer sequences |
| `primers_fasta` | Path | Path to the fasta file containing primer sequences.<br>IUPAC ambiguous nucleotides are accepted. |
| `blast_DB` | Path | Path to the BLAST database in the form <br>`path/to/folder/db-name` |
| `taxdb` | Path | Path to the folder containing the `taxdb` files |
| `rankedlineage_dmp` | Path | Path to the `rankedlineage.dmp` file from the <br>`taxdump` archive |
Expand Down
3 changes: 2 additions & 1 deletion workflow/envs/pandas.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ channels:
- conda-forge
dependencies:
- pandas=1.4.2
- scikit-learn=1.1.1
- scikit-learn=1.1.1
- biopython=1.81
2 changes: 1 addition & 1 deletion workflow/rules/blast.smk
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ rule collect_blast_stats:
"reports/blast_stats.tsv",
caption="../report/blast_stats.rst",
category="Quality controls",
)
),
message:
"[All][assignement] aggregating BLAST stats"
conda:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/common_benchmark.smk
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ def get_acc_blocklist(wildcards):
if config["seq_blocklist"] == "None":
return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report.tsv"
else:
return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report_prefiltered.tsv"
return f"{wildcards.sample}/taxonomy/{wildcards.sample}_blast_report_prefiltered.tsv"
21 changes: 17 additions & 4 deletions workflow/rules/trimming.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,24 @@ shell.executable("bash")
# Rules primers trimming


rule get_primer_revcomp:
rule primer_disambiguation:
output:
primers_rc=temp("common/primer_revcomp.fa"),
primers=temp("common/primer_explicit.fa"),
params:
primers=config["primers_fasta"],
conda:
"../envs/pandas.yaml"
log:
"logs/primer_disambiguation.log",
script:
"../scripts/primer_disambiguation.py"


rule get_primer_revcomp:
input:
primers="common/primer_explicit.fa",
output:
primers_rc=temp("common/primer_revcomp.fa"),
message:
"[Common][trimming] reverse-complementing primers"
conda:
Expand All @@ -17,7 +30,7 @@ rule get_primer_revcomp:
"logs/common/primer_revcomp.log",
shell:
"""
seqtk seq -r {params.primers} 1> {output.primers_rc} 2> {log}
seqtk seq -r {input.primers} 1> {output.primers_rc} 2> {log}
"""


Expand Down Expand Up @@ -46,7 +59,7 @@ rule cutadapt:
shell:
"""
# Simple case only 5p trimming
if [[ {params.primer_3p} == false ]]
if [[ {params.primer_3p} == False ]]
then
cutadapt {input.r1} \
{input.r2} \
Expand Down
10 changes: 5 additions & 5 deletions workflow/scripts/append_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@ def main(
benchmarkin, confmatin, yieldsin, metricsin, pr_curvein,
benchmarkout, confmatout, yieldsout, metricsout, pr_curveout,
pspace,
):
):

for fin, fout in [
(benchmarkin, benchmarkout),
(confmatin, confmatout),
(yieldsin, yieldsout),
(metricsin, metricsout),
(pr_curvein, pr_curveout),
]:
tbl=pd.read_csv(fin, sep="\t")
for k,v in pspace.items():
]:
tbl = pd.read_csv(fin, sep="\t")
for k, v in pspace.items():
tbl[k] = v
tbl.to_csv(fout, sep="\t", header=True, index=False)

Expand Down
16 changes: 8 additions & 8 deletions workflow/scripts/benchmark_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,32 @@
import numpy as np
import pandas as pd
from sklearn.metrics import (
precision_score,
recall_score,
f1_score,
precision_score,
recall_score,
f1_score,
average_precision_score,
)


def main(confmat, output, sample):
conf_table = pd.read_csv(confmat, sep="\t")

# get classification metrics
precision = precision_score(conf_table['expected'], conf_table['predicted'])
recall = recall_score(conf_table['expected'], conf_table['predicted'])
fscore = f1_score(conf_table['expected'], conf_table['predicted'])
prauc = average_precision_score(conf_table['expected'], conf_table['pred_ratio'])

# Get quantification metrics only on expected Taxids!
exponly = conf_table.loc[conf_table['expected'] == 1]
l2dist = np.linalg.norm(np.array(exponly['pred_ratio'])-np.array(exponly['exp_ratio']))
# error = mean(abs(pred - exp))
err = np.mean(np.absolute(np.array(exponly['pred_ratio'])-np.array(exponly['exp_ratio'])))

with open(output, "w") as fout:
fout.write("\t".join(["Sample", "Precision", "Recall", "F1 score", "Average precision", "Distance", "Error"]))
fout.write("\n")
fout.write("\t".join([sample, str(precision), str(recall), str(fscore), str(prauc), str(l2dist),str(err)]))
fout.write("\t".join([sample, str(precision), str(recall), str(fscore), str(prauc), str(l2dist), str(err)]))
fout.write("\n")


Expand All @@ -45,4 +45,4 @@ def main(confmat, output, sample):
confmat=snakemake.input['confmat'],
output=snakemake.output['metrics'],
sample=snakemake.params['sample'],
)
)
8 changes: 4 additions & 4 deletions workflow/scripts/conda_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@ def extract_package_version(envfile):
env = yaml.safe_load(stream)
for dep in env['dependencies']:
p, v = dep.split("=")
yield p,v
yield p, v


def main(report, basedir):
mypath = os.path.join(basedir, "envs")
envs = [
os.path.join(mypath, f) for f in os.listdir(mypath)
os.path.join(mypath, f) for f in os.listdir(mypath)
if os.path.isfile(os.path.join(mypath, f)) and f.lower().endswith(('.yaml', '.yml'))
]
df = []
for ef in envs:
for p,v in extract_package_version(ef):
df.append({'Package': p, 'Version':v})
for p, v in extract_package_version(ef):
df.append({'Package': p, 'Version': v})
df = pd.DataFrame(df)
df.sort_values('Package').to_csv(report, sep="\t", header=True, index=False)

Expand Down
5 changes: 1 addition & 4 deletions workflow/scripts/config_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@


import yaml
import pandas as pd


def main(default_config_file, params, output):
Expand All @@ -22,7 +21,7 @@ def main(default_config_file, params, output):

config.update(params)

dump = "\n".join([f"{k}: {v}" for k,v in config.items()])
dump = "\n".join([f"{k}: {v}" for k, v in config.items()])
with open(output, 'w') as stream:
stream.write(dump)

Expand All @@ -33,5 +32,3 @@ def main(default_config_file, params, output):
params=snakemake.params['pspace'],
output=snakemake.output['conf'],
)


Loading

0 comments on commit bad6d4f

Please sign in to comment.