From 3ad56936fe983c351adb5e46414940eb5e734678 Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 13:41:20 +0200 Subject: [PATCH 1/6] add bare-bone helper script to create a correctly formatted reference database out of the MIDORI2 UNIQ 16S database --- ressources/create_MIDORI_blastdb.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 ressources/create_MIDORI_blastdb.sh diff --git a/ressources/create_MIDORI_blastdb.sh b/ressources/create_MIDORI_blastdb.sh new file mode 100644 index 0000000..2d538dc --- /dev/null +++ b/ressources/create_MIDORI_blastdb.sh @@ -0,0 +1,20 @@ +# Get MIDORI Fasta and reformat header +wget -S -P ./source/ https://www.reference-midori.info/forceDownload.php?fName=download/Databases/GenBank259_2023-12-17/BLAST/uniq/fasta/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip \ + && gunzip -c ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip > ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta \ + && cut -d '.' -f1 ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta > MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta + +# taxdb +wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz \ + && tar -xzvf ./source/taxdb.tar.gz + +# taxdump +wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz \ + && tar -xzvf ./source/new_taxdump.tar.gz + +# taxid file +wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz \ + && gunzip -c ./source/nucl_gb.accession2taxid.gz > ./source/nucl_gb.accession2taxid \ + && cut -f 1,3 ./source/nucl_gb.accession2taxid | tail -n +2 > genbank2taxid + +# Make db +makeblastdb -in MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta -parse_seqids -blastdb_version 5 -taxid_map genbank2taxid -dbtype nucl From e09b18e62a295dee1bb5635e3b640d1a77dd448c Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 13:50:20 +0200 Subject: [PATCH 2/6] fix MIDORI script --- ressources/create_MIDORI_blastdb.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ressources/create_MIDORI_blastdb.sh b/ressources/create_MIDORI_blastdb.sh index 2d538dc..8d51af3 100644 --- a/ressources/create_MIDORI_blastdb.sh +++ b/ressources/create_MIDORI_blastdb.sh @@ -1,7 +1,7 @@ # Get MIDORI Fasta and reformat header wget -S -P ./source/ https://www.reference-midori.info/forceDownload.php?fName=download/Databases/GenBank259_2023-12-17/BLAST/uniq/fasta/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip \ && gunzip -c ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip > ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta \ - && cut -d '.' -f1 ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta > MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta + && cut -d '.' -f1,2 ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta > MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta # taxdb wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz \ @@ -14,7 +14,7 @@ wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_t # taxid file wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz \ && gunzip -c ./source/nucl_gb.accession2taxid.gz > ./source/nucl_gb.accession2taxid \ - && cut -f 1,3 ./source/nucl_gb.accession2taxid | tail -n +2 > genbank2taxid + && cut -f 2,3 ./source/nucl_gb.accession2taxid | tail -n +2 > genbank2taxid # Make db makeblastdb -in MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta -parse_seqids -blastdb_version 5 -taxid_map genbank2taxid -dbtype nucl From b0264a15d5d7f3cb33303c2dc93dd6f66b64fabf Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 13:53:51 +0200 Subject: [PATCH 3/6] compatibility snakemake8 --- workflow/paramspace | 2 +- workflow/rules/common_benchmark.smk | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/workflow/paramspace b/workflow/paramspace index 0f55962..15b24f3 100644 --- a/workflow/paramspace +++ b/workflow/paramspace @@ -68,7 +68,7 @@ rule run_foodme_benchmark: workdir=f"foodme_runs/{paramspace.wildcard_pattern}", snakefile=os.path.join(workflow.basedir, "benchmark"), cores=workflow.cores, - conda_prefix=workflow.conda_prefix, + conda_prefix=get_conda_prefix, force_rerun="--forceall" if config["force_rerun"] else "" log: f"logs/{paramspace.wildcard_pattern}/snakemake.log" diff --git a/workflow/rules/common_benchmark.smk b/workflow/rules/common_benchmark.smk index a0c2870..42150a8 100644 --- a/workflow/rules/common_benchmark.smk +++ b/workflow/rules/common_benchmark.smk @@ -36,6 +36,15 @@ def get_local_time(): return time.asctime(time.localtime(time.time())) +def get_conda_prefix(wildcards): + try: + # snakemake < 8.0 + return workflow.conda_prefix + except: + # snakemake > 8 + return workflow.deployment_settings.conda_prefix + + # Input functions ------------------------------------ def get_fastq(wildcards, read_pair="fq1"): return samples.loc[(wildcards.sample), [read_pair]].dropna()[0] From 1ad6b25668a9214e2799ed89d714c90a5c0bfc8b Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 14:26:15 +0200 Subject: [PATCH 4/6] MIDORI script taking almost full ids to avoid duplicate ids --- ressources/create_MIDORI_blastdb.sh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ressources/create_MIDORI_blastdb.sh b/ressources/create_MIDORI_blastdb.sh index 8d51af3..6d4d652 100644 --- a/ressources/create_MIDORI_blastdb.sh +++ b/ressources/create_MIDORI_blastdb.sh @@ -1,7 +1,14 @@ # Get MIDORI Fasta and reformat header wget -S -P ./source/ https://www.reference-midori.info/forceDownload.php?fName=download/Databases/GenBank259_2023-12-17/BLAST/uniq/fasta/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip \ && gunzip -c ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta.zip > ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta \ - && cut -d '.' -f1,2 ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta > MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta + && cut -d '#' -f1 ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta \ + | tr -d '<' \ + | sed 's/^>/@/' \ + | tr -d '>' \ + | tr '@' '>' \ + | cut -d ',' -f1,2 \ + | tr ',' '_' \ + > MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta # taxdb wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz \ @@ -12,9 +19,11 @@ wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_t && tar -xzvf ./source/new_taxdump.tar.gz # taxid file -wget -S -P ./source/ https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz \ - && gunzip -c ./source/nucl_gb.accession2taxid.gz > ./source/nucl_gb.accession2taxid \ - && cut -f 2,3 ./source/nucl_gb.accession2taxid | tail -n +2 > genbank2taxid +# Get taxid from fasta headers +paste \ + <(cat ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta | grep '^>' | cut -d '#' -f1 | tr -d '<' | tr -d '>' | cut -d ',' -f1,2 | tr ',' '_') \ + <(cat ./source/MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta | grep '^>' | rev | cut -d '_' -f1 | rev) \ + > ids2taxid # Make db -makeblastdb -in MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta -parse_seqids -blastdb_version 5 -taxid_map genbank2taxid -dbtype nucl +makeblastdb -in MIDORI2_UNIQ_NUC_GB259_lrRNA_BLAST.fasta -parse_seqids -blastdb_version 5 -taxid_map ids2taxid -dbtype nucl From e7a1a2017385e200905e8c71d9556cb65739fd00 Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 14:30:46 +0200 Subject: [PATCH 5/6] fix compatibility for snakemake8 --- workflow/paramspace | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/paramspace b/workflow/paramspace index 15b24f3..43c33b1 100644 --- a/workflow/paramspace +++ b/workflow/paramspace @@ -20,6 +20,9 @@ workdir: config["workdir"] paramspace = Paramspace(pd.read_csv(config['paramspace'], sep="\t")) +include: "rules/common_benchmark.smk" + + # Input rule ------------------------------------------------------------------ From 030b0baf8c3f4ea5ed4898a69f962e2ff646fc24 Mon Sep 17 00:00:00 2001 From: gregdenay Date: Fri, 19 Apr 2024 14:38:23 +0200 Subject: [PATCH 6/6] fix compatibility for snakemake8 --- workflow/paramspace | 11 ++++++++++- workflow/rules/common_benchmark.smk | 9 --------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/workflow/paramspace b/workflow/paramspace index 43c33b1..510c10d 100644 --- a/workflow/paramspace +++ b/workflow/paramspace @@ -20,7 +20,16 @@ workdir: config["workdir"] paramspace = Paramspace(pd.read_csv(config['paramspace'], sep="\t")) -include: "rules/common_benchmark.smk" +# Snakemake 8 compatibility ---------------------------------------------------- + + +def get_conda_prefix(wildcards): + try: + # snakemake < 8.0 + return workflow.conda_prefix + except: + # snakemake > 8 + return workflow.deployment_settings.conda_prefix # Input rule ------------------------------------------------------------------ diff --git a/workflow/rules/common_benchmark.smk b/workflow/rules/common_benchmark.smk index 42150a8..a0c2870 100644 --- a/workflow/rules/common_benchmark.smk +++ b/workflow/rules/common_benchmark.smk @@ -36,15 +36,6 @@ def get_local_time(): return time.asctime(time.localtime(time.time())) -def get_conda_prefix(wildcards): - try: - # snakemake < 8.0 - return workflow.conda_prefix - except: - # snakemake > 8 - return workflow.deployment_settings.conda_prefix - - # Input functions ------------------------------------ def get_fastq(wildcards, read_pair="fq1"): return samples.loc[(wildcards.sample), [read_pair]].dropna()[0]