From 76b0f6cbc7a604aed137e6ed9f8318f5fdc2aa28 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Sat, 23 Mar 2024 13:19:58 +0100 Subject: [PATCH 1/8] Update MuTect2 PON script and move it to separate dir --- mutect2_pon.nf | 132 ---------------------------- panel_of_normals/main.nf | 145 +++++++++++++++++++++++++++++++ panel_of_normals/nextflow.config | 81 +++++++++++++++++ 3 files changed, 226 insertions(+), 132 deletions(-) delete mode 100755 mutect2_pon.nf create mode 100755 panel_of_normals/main.nf create mode 100644 panel_of_normals/nextflow.config diff --git a/mutect2_pon.nf b/mutect2_pon.nf deleted file mode 100755 index bc10b8e..0000000 --- a/mutect2_pon.nf +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env nextflow - -gatk4_jar = "/code/gatk/4.1.3.0/gatk-package-4.1.3.0-local.jar" -// this version is needed to build a PON from multiple VCFs, it may change in the future -gatk40_jar = "/code/gatk/4.0.12.0/gatk-package-4.0.12.0-local.jar" -picard_jar = "/code/picard/2.21.2/picard.jar" - -params.help= false -params.input_files = false -params.reference = "/projects/data/gatk_bundle/hg19/ucsc.hg19.fasta" // TODO: remove this hard coded bit -params.intervals = "/projects/data/gatk_bundle/hg19/hg19_refseq_exons.sorted.merged.bed" -params.output = 'output' - -def helpMessage() { - log.info""" -Usage: - mutect2_pon.nf --input_files input_files - -This workflow aims to compute a panel of normals to be used with MuTect2 - -Input: - * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON - example: - sample1 /path/to/sample1.bam - sample2 /path/to/sample2.bam - NOTE: the sample name must be set in the @SN annotation - -Optional input: - * output: the folder where to publish output - -Output: - * Output combined VCF pon.vcf - """ -} - -if (params.help) { - helpMessage() - exit 0 -} - -// checks required inputs -if (params.input_files) { - Channel - .fromPath(params.input_files) - .splitCsv(header: ['name', 'bam'], sep: "\t") - .map { row-> tuple(row.name, file(row.bam), file(row.bam.replaceAll(/.bam$/, ".bai"))) } - .set { input_files } -} else { - exit 1, "Input file not specified!" -} - -if (params.intervals) { - Channel - .fromPath(params.intervals) - .splitText(by: 30000, file: true) - .set { intervals } -} - -process mutect2Pon { - cpus 2 - memory '16g' - module 'java/1.8.0' - errorStrategy 'finish' - - input: - set val(name), file(bam), file(bai), file(interval) from input_files.combine(intervals) - - output: - set val("${bam.baseName}"), file("${bam.baseName}.${interval.baseName}.mutect.vcf") into mutect_vcfs - - """ - mkdir -p `pwd`/scratch/tmp - java -Xmx16g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar ${gatk4_jar} \ - Mutect2 \ - --reference ${params.reference} \ - --intervals ${interval} \ - --input ${bam} \ - --tumor-sample ${name} \ - --max-mnp-distance 0 \ - --output ${bam.baseName}.${interval.baseName}.mutect.vcf - """ -} - -process gatherVcfs { - cpus 1 - memory '32g' - module 'java/1.8.0' - publishDir "${params.output}", mode: "copy" - - input: - set name, file(vcf_list) from mutect_vcfs.groupTuple() // group by name - - output: - file("${name}.vcf") into whole_vcfs - - script: - // NOTE: the input VCFs need to be provided in order by genomic coordinates - input_vcfs = "$vcf_list".split(" ") - .sort{ a, b -> a.tokenize(".")[-3].toInteger().compareTo b.tokenize(".")[-3].toInteger() } - .collect{"INPUT=" + it}.join(" ") - """ - mkdir -p `pwd`/scratch/tmp - java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar $picard_jar \ - GatherVcfs \ - ${input_vcfs} \ - OUTPUT=${name}.vcf - """ -} - -process createPON { - cpus 1 - memory '32g' - module 'java/1.8.0' - publishDir "${params.output}", mode: "copy" - - input: - file(vcf_list) from whole_vcfs.collect() - - output: - file("pon.vcf") - file("pon.vcf.idx") - - script: - input_vcfs = "$vcf_list".split(" ").collect{"--vcfs " + it}.join(" ") - """ - # combines VCFs and keeps variants occuring in at least two VCFs - mkdir -p `pwd`/scratch/tmp - java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar $gatk40_jar \ - CreateSomaticPanelOfNormals ${input_vcfs} \ - --output pon.vcf - """ -} diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf new file mode 100755 index 0000000..ac0a267 --- /dev/null +++ b/panel_of_normals/main.nf @@ -0,0 +1,145 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + + +// params definitions section + +params.help= false +params.gnomad = false +params.input_files = false +params.intervals = false +params.max_mnp_distance = 0 +params.memory_mutect2 = "16g" +params.memory_create_genomics_db = "8g" +params.memory_create_pon = "32g" +params.reference = false +params.output = "output" + +// checks of required input arguments + +def helpMessage() { + log.info params.help_message +} + +if (params.help) { + helpMessage() + exit 0 +} + +if (params.input_files) { + Channel + .fromPath(params.input_files) + .splitCsv(header: ["name", "bam"], sep: "\t") + .map { row-> tuple(row.name, row.bam) } + .set { input_files } +} else { + exit 1, "The following argument is required: --input_files!" +} + +if (! params.reference) { + exit 1, "The following argument is required: --reference!" +} + +if (! params.gnomad) { + exit 1, "The following argument is required: --gnomad!" +} + +// process definitions section + +process MUTECT2 { + cpus 2 + memory params.memory_mutect2 + tag "${name}" + publishDir "${params.output}/${name}", mode: "copy" + + conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" + + input: + tuple val(name), val(bam) + + output: + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf"), emit: vcf + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.idx"), emit: vcf_idx + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.stats"), emit: vcf_stats + + script: + intervals_option = params.intervals ? "--intervals ${params.intervals}" : "" + + """ + gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\ + --reference ${params.reference} \\ + ${intervals_option} \\ + --input ${bam} \\ + --tumor-sample ${name} \\ + --max-mnp-distance ${params.max_mnp_distance} \\ + --output ${name}.mutect2.unfiltered.vcf + """ +} + +process CREATE_GENOMICS_DB { + cpus 1 + memory params.memory_create_genomics_db + tag "genomics_db" + + conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" + + input: + val(vcf_list) + + output: + path("genomics_db", type: "dir") + + script: + intervals_option = params.intervals ? "--intervals ${params.intervals}" : "" + input_vcfs = vcf_list.collect{"--variant " + it}.join(" ") + + """ + mkdir -p `pwd`/scratch/tmp + gatk --java-options '-Xmx${params.memory_create_genomics_db}' GenomicsDBImport \\ + --reference ${params.reference} \\ + ${intervals_option} \\ + --genomicsdb-workspace-path genomics_db \\ + --tmp-dir `pwd`/scratch/tmp \\ + ${input_vcfs} + """ +} + +process CREATE_PON { + cpus 1 + memory params.memory_create_pon + tag "pon.vcf" + publishDir "${params.output}", mode: "copy" + + conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" + + input: + path(genomics_db) + + output: + file("pon.vcf") + file("pon.vcf.idx") + + script: + """ + gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\ + --reference ${params.reference} \\ + --germline-resource ${params.gnomad} \\ + --variant gendb://${genomics_db} \\ + --output pon.vcf + """ +} + +// workflow definitions section + +workflow { + MUTECT2(input_files) + + CREATE_GENOMICS_DB( + MUTECT2.out.vcf + .map { name, vcf -> vcf } + .collect() + ) + + CREATE_PON(CREATE_GENOMICS_DB.out) +} diff --git a/panel_of_normals/nextflow.config b/panel_of_normals/nextflow.config new file mode 100644 index 0000000..4c7412a --- /dev/null +++ b/panel_of_normals/nextflow.config @@ -0,0 +1,81 @@ +/* + * ------------------------------------------------- + * TRON-Bioinformatics/tronflow-mutect2 Nextflow config file + * ------------------------------------------------- + * Default config options for all environments. + */ + +profiles { + conda { + params.enable_conda = true + conda.enabled = true + conda.createTimeout = "120 min" + } + debug { + cleanup = false + dumpHashes = true + process.beforeScript = "echo $HOSTNAME" + } + mamba { + params.enable_conda = true + conda.enabled = true + conda.createTimeout = "120 min" + conda.useMamba = true + } + test { + timeline.enabled = false + report.enabled = false + trace.enabled = false + dag.enabled = false + } +} + +// export these variables to prevent local Python/R libraries from conflicting with those in the container +env { + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" +} + +// capture exit codes from upstream processes when piping +process.shell = ["/bin/bash", "-euo", "pipefail"] + +VERSION = "1.8.0" +DOI = "https://zenodo.org/badge/latestdoi/355860788" + +manifest { + name = "TRON-Bioinformatics/tronflow-mutect2" + author = "Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr" + homePage = "https://github.com/TRON-Bioinformatics/tronflow-mutect2" + description = "Mutect2 best practices workflow to create a panel of normals" + mainScript = "main.nf" + nextflowVersion = ">=19.10.0" + version = VERSION + doi = DOI +} +params.manifest = manifest + +params.help_message = """ +${params.manifest.name} v${params.manifest.version} +Author: ${params.manifest.author} + +${params.manifest.description} + +Usage: + nextflow run ${params.manifest.mainScript} --input_files input_files + +This workflow aims to compute a panel of normals to be used with MuTect2 + +Input: + * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON + example: + sample1 /path/to/sample1.bam + sample2 /path/to/sample2.bam + NOTE: the sample name must be set in the @SN annotation + +Optional input: + * output: the folder where to publish output + +Output: + * Output combined VCF pon.vcf +""" From 7efb9a0478b21e51a41cc3611cfa1272322c9dd5 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Sat, 23 Mar 2024 13:21:07 +0100 Subject: [PATCH 2/8] Introduce some performance improvements --- panel_of_normals/main.nf | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf index ac0a267..2541f93 100755 --- a/panel_of_normals/main.nf +++ b/panel_of_normals/main.nf @@ -11,7 +11,7 @@ params.input_files = false params.intervals = false params.max_mnp_distance = 0 params.memory_mutect2 = "16g" -params.memory_create_genomics_db = "8g" +params.memory_create_genomics_db = "16g" params.memory_create_pon = "32g" params.reference = false params.output = "output" @@ -67,18 +67,20 @@ process MUTECT2 { intervals_option = params.intervals ? "--intervals ${params.intervals}" : "" """ + mkdir -p `pwd`/scratch/tmp gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\ --reference ${params.reference} \\ ${intervals_option} \\ --input ${bam} \\ --tumor-sample ${name} \\ --max-mnp-distance ${params.max_mnp_distance} \\ - --output ${name}.mutect2.unfiltered.vcf + --output ${name}.mutect2.unfiltered.vcf \\ + --tmp-dir `pwd`/scratch/tmp """ } process CREATE_GENOMICS_DB { - cpus 1 + cpus 2 memory params.memory_create_genomics_db tag "genomics_db" @@ -101,6 +103,9 @@ process CREATE_GENOMICS_DB { ${intervals_option} \\ --genomicsdb-workspace-path genomics_db \\ --tmp-dir `pwd`/scratch/tmp \\ + --genomicsdb-shared-posixfs-optimizations true \\ + --batch-size 50 \\ + --bypass-feature-reader \\ ${input_vcfs} """ } @@ -122,11 +127,13 @@ process CREATE_PON { script: """ + mkdir -p `pwd`/scratch/tmp gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\ --reference ${params.reference} \\ --germline-resource ${params.gnomad} \\ --variant gendb://${genomics_db} \\ - --output pon.vcf + --output pon.vcf \\ + --tmp-dir `pwd`/scratch/tmp """ } @@ -135,11 +142,11 @@ process CREATE_PON { workflow { MUTECT2(input_files) - CREATE_GENOMICS_DB( - MUTECT2.out.vcf - .map { name, vcf -> vcf } - .collect() - ) + // CREATE_GENOMICS_DB( + // MUTECT2.out.vcf + // .map { name, vcf -> vcf } + // .collect() + // ) - CREATE_PON(CREATE_GENOMICS_DB.out) + // CREATE_PON(CREATE_GENOMICS_DB.out) } From 47f514f2eec48d1988f566b28713713d47462cf1 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Sun, 24 Mar 2024 12:20:54 +0100 Subject: [PATCH 3/8] Try to parallelize steps of pipeline --- panel_of_normals/main.nf | 110 +++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 32 deletions(-) diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf index 2541f93..afacd0e 100755 --- a/panel_of_normals/main.nf +++ b/panel_of_normals/main.nf @@ -11,10 +11,12 @@ params.input_files = false params.intervals = false params.max_mnp_distance = 0 params.memory_mutect2 = "16g" -params.memory_create_genomics_db = "16g" +params.memory_gather_vcfs = "32g" +params.memory_genomicsdb_import = "16g" params.memory_create_pon = "32g" params.reference = false params.output = "output" +params.genomicsdb_output = "output" // checks of required input arguments @@ -45,69 +47,115 @@ if (! params.gnomad) { exit 1, "The following argument is required: --gnomad!" } +if (params.intervals) { + Channel + .fromPath(params.intervals) + .splitText(by: 30000, file: true) + .set { intervals } +} else { + exit 1, "The following argument is required: --intervals!" +} + // process definitions section process MUTECT2 { cpus 2 memory params.memory_mutect2 - tag "${name}" - publishDir "${params.output}/${name}", mode: "copy" + tag "${name}-${interval}" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" input: - tuple val(name), val(bam) + tuple val(name), val(bam), path(interval) output: - tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf"), emit: vcf - tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.idx"), emit: vcf_idx - tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.stats"), emit: vcf_stats + tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz") script: - intervals_option = params.intervals ? "--intervals ${params.intervals}" : "" - """ mkdir -p `pwd`/scratch/tmp gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\ --reference ${params.reference} \\ - ${intervals_option} \\ + --intervals ${interval} \\ --input ${bam} \\ --tumor-sample ${name} \\ --max-mnp-distance ${params.max_mnp_distance} \\ - --output ${name}.mutect2.unfiltered.vcf \\ + --output ${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz \\ --tmp-dir `pwd`/scratch/tmp """ } -process CREATE_GENOMICS_DB { - cpus 2 - memory params.memory_create_genomics_db - tag "genomics_db" +process GATHER_VCFS { + cpus 1 + memory params.memory_gather_vcfs + tag "${name}" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" input: - val(vcf_list) + tuple val(name), val(vcf_list) output: - path("genomics_db", type: "dir") + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), path("${name}.mutect2.unfiltered.vcf.gz.tbi") script: - intervals_option = params.intervals ? "--intervals ${params.intervals}" : "" - input_vcfs = vcf_list.collect{"--variant " + it}.join(" ") + // NOTE: the input VCFs need to be provided in order by genomic coordinates + input_vcfs = vcf_list + .sort { a, b -> file(a).baseName.compareTo(file(b).baseName) } + .collect{"-INPUT " + it}.join(" ") + """ + mkdir -p `pwd`/scratch/tmp + gatk --java-options '-Xmx${params.memory_gather_vcfs} -Djava.io.tmpdir=`pwd`/scratch/tmp' GatherVcfs \\ + ${input_vcfs} \\ + -OUTPUT ${name}.mutect2.unfiltered.vcf.gz + tabix -p vcf ${name}.mutect2.unfiltered.vcf.gz + """ +} + +process GENOMICSDB_IMPORT { + cpus 2 + memory params.memory_genomicsdb_import + tag "${name}" + conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" + + input: + tuple val(name), path(vcf), path(vcf_tbi) + + output: + val("Import completed") + + script: + if (! file("${params.genomicsdb_output}/pon.gdb").exists() || + file("${params.genomicsdb_output}/pon.gdb").list().length == 0) { + """ + mkdir -p `pwd`/scratch/tmp + gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\ + --reference ${params.reference} \\ + --intervals ${params.intervals} \\ + --genomicsdb-workspace-path ${params.genomicsdb_output}/pon.gdb \\ + --tmp-dir `pwd`/scratch/tmp \\ + --genomicsdb-shared-posixfs-optimizations true \\ + --batch-size 50 \\ + --bypass-feature-reader \\ + --merge-input-intervals \\ + --variant ${vcf} + """ + } else { """ mkdir -p `pwd`/scratch/tmp - gatk --java-options '-Xmx${params.memory_create_genomics_db}' GenomicsDBImport \\ + gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\ --reference ${params.reference} \\ - ${intervals_option} \\ - --genomicsdb-workspace-path genomics_db \\ + --intervals ${params.intervals} \\ + --genomicsdb-update-workspace-path ${params.genomicsdb_output}/pon.gdb \\ --tmp-dir `pwd`/scratch/tmp \\ --genomicsdb-shared-posixfs-optimizations true \\ --batch-size 50 \\ --bypass-feature-reader \\ - ${input_vcfs} + --merge-input-intervals \\ + --variant ${vcf} """ + } } process CREATE_PON { @@ -119,7 +167,7 @@ process CREATE_PON { conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" input: - path(genomics_db) + val(genomicsdb_import_completed) output: file("pon.vcf") @@ -131,7 +179,7 @@ process CREATE_PON { gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\ --reference ${params.reference} \\ --germline-resource ${params.gnomad} \\ - --variant gendb://${genomics_db} \\ + --variant gendb://${params.genomicsdb_output}/pon.gdb \\ --output pon.vcf \\ --tmp-dir `pwd`/scratch/tmp """ @@ -140,13 +188,11 @@ process CREATE_PON { // workflow definitions section workflow { - MUTECT2(input_files) + MUTECT2(input_files.combine(intervals)) + + GATHER_VCFS(MUTECT2.out.groupTuple()) - // CREATE_GENOMICS_DB( - // MUTECT2.out.vcf - // .map { name, vcf -> vcf } - // .collect() - // ) + GENOMICSDB_IMPORT(GATHER_VCFS.out) - // CREATE_PON(CREATE_GENOMICS_DB.out) + CREATE_PON(GENOMICSDB_IMPORT.out.collect()) } From bd216e2000eb6006b078538d8d56cb50616e7321 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Sun, 24 Mar 2024 14:49:00 +0100 Subject: [PATCH 4/8] Execute GenomicsDBImport only once for all sample VCFs - first working version --- panel_of_normals/main.nf | 73 ++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf index afacd0e..f32194f 100755 --- a/panel_of_normals/main.nf +++ b/panel_of_normals/main.nf @@ -6,17 +6,16 @@ nextflow.enable.dsl = 2 // params definitions section params.help= false -params.gnomad = false params.input_files = false +params.reference = false params.intervals = false +params.gnomad = false +params.output = "output" params.max_mnp_distance = 0 params.memory_mutect2 = "16g" params.memory_gather_vcfs = "32g" params.memory_genomicsdb_import = "16g" params.memory_create_pon = "32g" -params.reference = false -params.output = "output" -params.genomicsdb_output = "output" // checks of required input arguments @@ -62,6 +61,7 @@ process MUTECT2 { cpus 2 memory params.memory_mutect2 tag "${name}-${interval}" + publishDir "${params.output}/logs/${name}", pattern: "*.mutect2.log", mode: "copy" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" @@ -69,7 +69,8 @@ process MUTECT2 { tuple val(name), val(bam), path(interval) output: - tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz") + tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz"), emit: vcf + path("${name}.${interval.baseName}.mutect2.log"), emit: log script: """ @@ -82,6 +83,7 @@ process MUTECT2 { --max-mnp-distance ${params.max_mnp_distance} \\ --output ${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz \\ --tmp-dir `pwd`/scratch/tmp + cp .command.log ${name}.${interval.baseName}.mutect2.log """ } @@ -89,6 +91,8 @@ process GATHER_VCFS { cpus 1 memory params.memory_gather_vcfs tag "${name}" + publishDir "${params.output}", pattern: "*.mutect2.unfiltered.vcf*", mode: "copy" + publishDir "${params.output}/logs/${name}", pattern: "*.gathervcfs.log", mode: "copy" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" @@ -96,7 +100,9 @@ process GATHER_VCFS { tuple val(name), val(vcf_list) output: - tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), path("${name}.mutect2.unfiltered.vcf.gz.tbi") + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), emit: vcf + tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz.tbi"), emit: vcf_tbi + path("${name}.gathervcfs.log"), emit: log script: // NOTE: the input VCFs need to be provided in order by genomic coordinates @@ -109,69 +115,59 @@ process GATHER_VCFS { ${input_vcfs} \\ -OUTPUT ${name}.mutect2.unfiltered.vcf.gz tabix -p vcf ${name}.mutect2.unfiltered.vcf.gz + cp .command.log ${name}.gathervcfs.log """ } process GENOMICSDB_IMPORT { cpus 2 memory params.memory_genomicsdb_import - tag "${name}" + tag "pon.gdb" + publishDir "${params.output}/logs", pattern: "genomicsdbimport.log", mode: "copy" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" input: - tuple val(name), path(vcf), path(vcf_tbi) + val(vcf_list) output: - val("Import completed") + path("pon.gdb"), emit: genomicsdb + path("genomicsdbimport.log"), emit: log script: - if (! file("${params.genomicsdb_output}/pon.gdb").exists() || - file("${params.genomicsdb_output}/pon.gdb").list().length == 0) { - """ - mkdir -p `pwd`/scratch/tmp - gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\ - --reference ${params.reference} \\ - --intervals ${params.intervals} \\ - --genomicsdb-workspace-path ${params.genomicsdb_output}/pon.gdb \\ - --tmp-dir `pwd`/scratch/tmp \\ - --genomicsdb-shared-posixfs-optimizations true \\ - --batch-size 50 \\ - --bypass-feature-reader \\ - --merge-input-intervals \\ - --variant ${vcf} - """ - } else { + input_vcfs = vcf_list.collect{"--variant " + it}.join(" ") """ mkdir -p `pwd`/scratch/tmp gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\ --reference ${params.reference} \\ --intervals ${params.intervals} \\ - --genomicsdb-update-workspace-path ${params.genomicsdb_output}/pon.gdb \\ + --genomicsdb-workspace-path pon.gdb \\ --tmp-dir `pwd`/scratch/tmp \\ --genomicsdb-shared-posixfs-optimizations true \\ --batch-size 50 \\ --bypass-feature-reader \\ --merge-input-intervals \\ - --variant ${vcf} + ${input_vcfs} + cp .command.log genomicsdbimport.log """ - } } process CREATE_PON { cpus 1 memory params.memory_create_pon tag "pon.vcf" - publishDir "${params.output}", mode: "copy" + publishDir "${params.output}", pattern: "pon.vcf*", mode: "copy" + publishDir "${params.output}/logs", pattern: "createsomaticpanelofnormals.log", mode: "copy" conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12" input: - val(genomicsdb_import_completed) + path(genomics_db) output: - file("pon.vcf") - file("pon.vcf.idx") + path("pon.vcf") + path("pon.vcf.idx") + path("createsomaticpanelofnormals.log") script: """ @@ -179,9 +175,10 @@ process CREATE_PON { gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\ --reference ${params.reference} \\ --germline-resource ${params.gnomad} \\ - --variant gendb://${params.genomicsdb_output}/pon.gdb \\ + --variant gendb://${genomics_db} \\ --output pon.vcf \\ --tmp-dir `pwd`/scratch/tmp + cp .command.log createsomaticpanelofnormals.log """ } @@ -190,9 +187,13 @@ process CREATE_PON { workflow { MUTECT2(input_files.combine(intervals)) - GATHER_VCFS(MUTECT2.out.groupTuple()) + GATHER_VCFS(MUTECT2.out.vcf.groupTuple()) - GENOMICSDB_IMPORT(GATHER_VCFS.out) + GENOMICSDB_IMPORT( + GATHER_VCFS.out.vcf + .map { name, vcf -> vcf } + .collect() + ) - CREATE_PON(GENOMICSDB_IMPORT.out.collect()) + CREATE_PON(GENOMICSDB_IMPORT.out.genomicsdb) } From 3bfc1179b0f7c1a50237187632d2fc2d6f680027 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Sun, 24 Mar 2024 14:49:43 +0100 Subject: [PATCH 5/8] Update help message in config file --- panel_of_normals/nextflow.config | 34 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/panel_of_normals/nextflow.config b/panel_of_normals/nextflow.config index 4c7412a..9f7de1f 100644 --- a/panel_of_normals/nextflow.config +++ b/panel_of_normals/nextflow.config @@ -1,6 +1,6 @@ /* * ------------------------------------------------- - * TRON-Bioinformatics/tronflow-mutect2 Nextflow config file + * TRON-Bioinformatics/tronflow-mutect2/panel_of_normals Nextflow config file * ------------------------------------------------- * Default config options for all environments. */ @@ -44,10 +44,10 @@ VERSION = "1.8.0" DOI = "https://zenodo.org/badge/latestdoi/355860788" manifest { - name = "TRON-Bioinformatics/tronflow-mutect2" + name = "TRON-Bioinformatics/tronflow-mutect2/panel_of_normals" author = "Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr" homePage = "https://github.com/TRON-Bioinformatics/tronflow-mutect2" - description = "Mutect2 best practices workflow to create a panel of normals" + description = "MuTect2 best practices workflow to create a panel of normals" mainScript = "main.nf" nextflowVersion = ">=19.10.0" version = VERSION @@ -57,25 +57,33 @@ params.manifest = manifest params.help_message = """ ${params.manifest.name} v${params.manifest.version} -Author: ${params.manifest.author} +Authors: ${params.manifest.author} ${params.manifest.description} Usage: - nextflow run ${params.manifest.mainScript} --input_files input_files - -This workflow aims to compute a panel of normals to be used with MuTect2 + nextflow run ${params.manifest.mainScript} --input_files INPUT_FILES --reference REFERENCE --intervals INTERVALS --gnomad GNOMAD [--output [OUTPUT]] [--max_mnp_distance [MAX_MNP_DISTANCE]] [--memory_mutect2 [MEMORY_MUTECT2]] [--memory_gather_vcfs [MEMORY_GATHER_VCFS]] [--memory_genomicsdb_import [MEMORY_GENOMICSDB_IMPORT]] [--memory_create_pon [MEMORY_CREATE_PON]] Input: - * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON + * input_files: the path to a tab-separated values file containing in each row the sample name and the path to a BAM file to be considered for computing the PON (indexes expected *.bai) example: - sample1 /path/to/sample1.bam - sample2 /path/to/sample2.bam - NOTE: the sample name must be set in the @SN annotation + sample1 /path/to/sample1.bam + sample2 /path/to/sample2.bam + NOTE: the sample name must be set in the @SM annotation of the BAM header + * reference: the path to the FASTA genome reference (indexes expected *.fai, *.dict) + * intervals: the path to a BED file containing the genomic regions to analyze + * gnomad: the path to the gnomad VCF or other germline resource Optional input: - * output: the folder where to publish output + * output: the folder where to publish output (default: output) + * max_mnp_distance: maximum MNP distance that will be passed to Mutect2 (default: 0, recommended for creating a PON) + * memory_mutect2: the ammount of memory used by Mutect2 (default: 16g) + * memory_gather_vcfs: the ammount of memory used by Picard's GatherVcfs (default: 32g) + * memory_genomicsdb_import: the ammount of memory used by GenomicsDBImport (default: 16g) + * memory_create_pon: the ammount of memory used by CreateSomaticPanelOfNormals (default: 32g) Output: - * Output combined VCF pon.vcf + * Output VCF files of single samples and TBI indexes + * Output combined VCF `pon.vcf` and index `pon.vcf.idx` + * Log files of all processes in `logs/` directory """ From ba6b3b3fd52f9b3058250b3353a899966fcec318 Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Tue, 26 Mar 2024 13:36:10 +0100 Subject: [PATCH 6/8] Update README regarding new PON pipeline --- README.md | 58 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 8eb7a67..c146ed4 100644 --- a/README.md +++ b/README.md @@ -138,28 +138,16 @@ java -jar /code/picard/2.21.2/picard.jar LiftoverVcf INPUT=/projects/data/gatk_b ### Panel Of Normals (PON) -The PON is used to filter out technical artifacts from the somatic variant calls. The PON is recommended to be formed from technically similar samples (ie: same sequencing platform, same sample preparation), from healthy and young individuals and to be formed by a minimum of 40 samples (see https://gatkforums.broadinstitute.org/gatk/discussion/11053/panel-of-normals-pon ). +The PON is used to filter out technical artifacts from the somatic variant calls. The PON is recommended to be formed from technically similar samples (ie: same sequencing platform, same sample preparation), from healthy and young individuals and to be formed by a minimum of 40 samples (see [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) for further details). The normal samples are processed by the BAM preprocessing pipeline including marking duplicates and BQSR. -Run MuTect2 on each normal sample as follows: +To compute the PON, we implemented a best practice [Nextflow pipeline](panel_of_normals/main.nf) according to the GATK documentation: -``` -java -Xmx16g -jar /code/gatk/4.1.3.0/gatk-package-4.1.3.0-local.jar \ - Mutect2 \ - --reference ${params.reference} \ - --intervals ${interval} \ - --input ${bam} \ - --tumor-sample ${name} \ - --max-mnp-distance 0 \ - --output ${bam.baseName}.${interval.baseName}.mutect.vcf -``` - -Note the parameter "--max-mnp-distance 0" is needed to avoid MNPs being called. +- [(How to) Call somatic mutations using GATK4 Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2) +- [Panel of Normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) -The multiple VCFs need to be combined with the GATK tool "CreateSomaticPanelOfNormals". - -This is implemented in the pipeline `mutect2_pon.vcf`. +Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. ### Configuring Funcotator @@ -171,25 +159,37 @@ Also, make sure that the reference version provided to funcotator with `--refere ## How to run the Panel of Normals (PON) pipeline -``` -$ nextflow mutect2_pon.nf --help -Usage: - mutect2_pon.nf --input_files input_files +```bash +TRON-Bioinformatics/tronflow-mutect2/panel_of_normals v1.8.0 +Authors: Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr + +MuTect2 best practices workflow to create a panel of normals -This workflow aims to compute a panel of normals to be used with MuTect2 +Usage: + nextflow run main.nf --input_files INPUT_FILES --reference REFERENCE --intervals INTERVALS --gnomad GNOMAD [--output [OUTPUT]] [--max_mnp_distance [MAX_MNP_DISTANCE]] [--memory_mutect2 [MEMORY_MUTECT2]] [--memory_gather_vcfs [MEMORY_GATHER_VCFS]] [--memory_genomicsdb_import [MEMORY_GENOMICSDB_IMPORT]] [--memory_create_pon [MEMORY_CREATE_PON]] Input: - * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON - example: - sample1 /path/to/sample1.bam - sample2 /path/to/sample2.bam - NOTE: the sample name must be set in the @SN annotation + * input_files: the path to a tab-separated values file containing in each row the sample name and the path to a BAM file to be considered for computing the PON (indexes expected *.bai) + example: + sample1 /path/to/sample1.bam + sample2 /path/to/sample2.bam + NOTE: the sample name must be set in the @SM annotation of the BAM header + * reference: the path to the FASTA genome reference (indexes expected *.fai, *.dict) + * intervals: the path to a BED file containing the genomic regions to analyze + * gnomad: the path to the gnomad VCF or other germline resource Optional input: - * output: the folder where to publish output + * output: the folder where to publish output (default: output) + * max_mnp_distance: maximum MNP distance that will be passed to Mutect2 (default: 0, recommended for creating a PON) + * memory_mutect2: the ammount of memory used by Mutect2 (default: 16g) + * memory_gather_vcfs: the ammount of memory used by Picard's GatherVcfs (default: 32g) + * memory_genomicsdb_import: the ammount of memory used by GenomicsDBImport (default: 16g) + * memory_create_pon: the ammount of memory used by CreateSomaticPanelOfNormals (default: 32g) Output: - * Output combined VCF pon.vcf + * Output VCF files of single samples and TBI indexes + * Output combined VCF `pon.vcf` and index `pon.vcf.idx` + * Log files of all processes in `logs/` directory ``` ## References From 97a59f940a94daf176f4dfb8560cfe73d375e72f Mon Sep 17 00:00:00 2001 From: Julian Thomas Mohr Date: Tue, 26 Mar 2024 13:39:58 +0100 Subject: [PATCH 7/8] Add reference to GenomicsDBImport usage and performance guidelines to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c146ed4..0e0e3cb 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ To compute the PON, we implemented a best practice [Nextflow pipeline](panel_of_ - [(How to) Call somatic mutations using GATK4 Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2) - [Panel of Normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) -Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. +Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. To improve the performance of this step, we applied the recommendations according to GATK's [GenomicsDBImport usage and performance guidelines](https://gatk.broadinstitute.org/hc/en-us/articles/360056138571-GenomicsDBImport-usage-and-performance-guidelines). ### Configuring Funcotator From 4fd55de9a038123a5bed9ac45dfa6aff665a450e Mon Sep 17 00:00:00 2001 From: lkress <35309241+LKress@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:41:02 +0200 Subject: [PATCH 8/8] Update java version in automated_tests.yml * Required as nextflow version was built against a non-canonical more recent java version --- .github/workflows/automated_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index a38aaff..39e3dc6 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -8,10 +8,10 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-java@v2 + - uses: actions/setup-java@v3 with: distribution: 'zulu' # See 'Supported distributions' for available options - java-version: '8' + java-version: '11' - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true @@ -28,4 +28,4 @@ jobs: key: ${{ runner.os }}-tronflow-mutect2 - name: Run tests run: | - make \ No newline at end of file + make