From 76b0f6cbc7a604aed137e6ed9f8318f5fdc2aa28 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Sat, 23 Mar 2024 13:19:58 +0100
Subject: [PATCH 1/8] Update MuTect2 PON script and move it to separate dir

---
 mutect2_pon.nf                   | 132 ----------------------------
 panel_of_normals/main.nf         | 145 +++++++++++++++++++++++++++++++
 panel_of_normals/nextflow.config |  81 +++++++++++++++++
 3 files changed, 226 insertions(+), 132 deletions(-)
 delete mode 100755 mutect2_pon.nf
 create mode 100755 panel_of_normals/main.nf
 create mode 100644 panel_of_normals/nextflow.config

diff --git a/mutect2_pon.nf b/mutect2_pon.nf
deleted file mode 100755
index bc10b8e..0000000
--- a/mutect2_pon.nf
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env nextflow
-
-gatk4_jar = "/code/gatk/4.1.3.0/gatk-package-4.1.3.0-local.jar"
-// this version is needed to build a PON from multiple VCFs, it may change in the future
-gatk40_jar = "/code/gatk/4.0.12.0/gatk-package-4.0.12.0-local.jar"
-picard_jar = "/code/picard/2.21.2/picard.jar"
-
-params.help= false
-params.input_files = false
-params.reference = "/projects/data/gatk_bundle/hg19/ucsc.hg19.fasta"							// TODO: remove this hard coded bit
-params.intervals = "/projects/data/gatk_bundle/hg19/hg19_refseq_exons.sorted.merged.bed"
-params.output = 'output'
-
-def helpMessage() {
-    log.info"""
-Usage:
-    mutect2_pon.nf --input_files input_files
-
-This workflow aims to compute a panel of normals to be used with MuTect2
-
-Input:
-    * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON
-    	example:
-	sample1	/path/to/sample1.bam
-	sample2	/path/to/sample2.bam
-	NOTE: the sample name must be set in the @SN annotation
-
-Optional input:
-    * output: the folder where to publish output
-
-Output:
-    * Output combined VCF pon.vcf
-    """
-}
-
-if (params.help) {
-    helpMessage()
-    exit 0
-}
-
-// checks required inputs
-if (params.input_files) {
-  Channel
-    .fromPath(params.input_files)
-    .splitCsv(header: ['name', 'bam'], sep: "\t")
-    .map { row-> tuple(row.name, file(row.bam), file(row.bam.replaceAll(/.bam$/, ".bai"))) }
-    .set { input_files }
-} else {
-  exit 1, "Input file not specified!"
-}
-
-if (params.intervals) {
-  Channel
-    .fromPath(params.intervals)
-    .splitText(by: 30000, file: true)
-    .set { intervals }
-}
-
-process mutect2Pon {
-    cpus 2
-    memory '16g'
-    module 'java/1.8.0'
-    errorStrategy 'finish'
-
-    input:
-    	set val(name), file(bam), file(bai), file(interval) from input_files.combine(intervals)
-
-    output:
-	    set val("${bam.baseName}"), file("${bam.baseName}.${interval.baseName}.mutect.vcf")  into mutect_vcfs
-
-    """
-    mkdir -p `pwd`/scratch/tmp
-    java -Xmx16g -Djava.io.tmpdir=`pwd`/scratch/tmp -jar ${gatk4_jar} \
-    Mutect2 \
-    --reference ${params.reference} \
-    --intervals ${interval} \
-    --input ${bam} \
-    --tumor-sample ${name} \
-    --max-mnp-distance 0 \
-    --output ${bam.baseName}.${interval.baseName}.mutect.vcf
-    """
-}
-
-process gatherVcfs {
-    cpus 1
-    memory '32g'
-    module 'java/1.8.0'
-    publishDir "${params.output}", mode: "copy"
-
-    input:
-    	set name, file(vcf_list) from mutect_vcfs.groupTuple()	// group by name
-
-    output:
-    	file("${name}.vcf") into whole_vcfs
-
-    script:
-      	// NOTE: the input VCFs need to be provided in order by genomic coordinates
-    	input_vcfs = "$vcf_list".split(" ")
-        .sort{ a, b -> a.tokenize(".")[-3].toInteger().compareTo b.tokenize(".")[-3].toInteger() }
-        .collect{"INPUT=" + it}.join(" ")
-    	"""
-      	mkdir -p `pwd`/scratch/tmp
-    	java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp  -jar $picard_jar \
-    	GatherVcfs \
-      	${input_vcfs} \
-      	OUTPUT=${name}.vcf
-	    """
-}
-
-process createPON {
-    cpus 1
-    memory '32g'
-    module 'java/1.8.0'
-    publishDir "${params.output}", mode: "copy"
-
-    input:
-    	file(vcf_list) from whole_vcfs.collect()
-
-    output:
-    	file("pon.vcf")
-      file("pon.vcf.idx")
-
-    script:
-    	input_vcfs = "$vcf_list".split(" ").collect{"--vcfs " + it}.join(" ")
-    	"""
-      # combines VCFs and keeps variants occuring in at least two VCFs
-	    mkdir -p `pwd`/scratch/tmp
-    	java -Xmx32g -Djava.io.tmpdir=`pwd`/scratch/tmp  -jar $gatk40_jar \
-    	CreateSomaticPanelOfNormals ${input_vcfs} \
-      --output pon.vcf
-    	"""
-}
diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf
new file mode 100755
index 0000000..ac0a267
--- /dev/null
+++ b/panel_of_normals/main.nf
@@ -0,0 +1,145 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+
+// params definitions section
+
+params.help= false
+params.gnomad = false
+params.input_files = false
+params.intervals = false
+params.max_mnp_distance = 0
+params.memory_mutect2 = "16g"
+params.memory_create_genomics_db = "8g"
+params.memory_create_pon = "32g"
+params.reference = false
+params.output = "output"
+
+// checks of required input arguments
+
+def helpMessage() {
+    log.info params.help_message
+}
+
+if (params.help) {
+    helpMessage()
+    exit 0
+}
+
+if (params.input_files) {
+    Channel
+        .fromPath(params.input_files)
+        .splitCsv(header: ["name", "bam"], sep: "\t")
+        .map { row-> tuple(row.name, row.bam) }
+        .set { input_files }
+} else {
+    exit 1, "The following argument is required: --input_files!"
+}
+
+if (! params.reference) {
+    exit 1, "The following argument is required: --reference!"
+}
+
+if (! params.gnomad) {
+    exit 1, "The following argument is required: --gnomad!"
+}
+
+// process definitions section
+
+process MUTECT2 {
+    cpus 2
+    memory params.memory_mutect2
+    tag "${name}"
+    publishDir "${params.output}/${name}", mode: "copy"
+
+    conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
+
+    input:
+    	tuple val(name), val(bam)
+
+    output:
+	    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf"), emit: vcf
+        tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.idx"), emit: vcf_idx
+        tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.stats"), emit: vcf_stats
+
+    script:
+        intervals_option = params.intervals ? "--intervals ${params.intervals}" : ""
+
+        """
+        gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\
+            --reference ${params.reference} \\
+            ${intervals_option} \\
+            --input ${bam} \\
+            --tumor-sample ${name} \\
+            --max-mnp-distance ${params.max_mnp_distance} \\
+            --output ${name}.mutect2.unfiltered.vcf
+        """
+}
+
+process CREATE_GENOMICS_DB {
+    cpus 1
+    memory params.memory_create_genomics_db
+    tag "genomics_db"
+
+    conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
+
+    input:
+    	val(vcf_list)
+
+    output:
+	    path("genomics_db", type: "dir")
+
+    script:
+        intervals_option = params.intervals ? "--intervals ${params.intervals}" : ""
+        input_vcfs = vcf_list.collect{"--variant " + it}.join(" ")
+
+        """
+        mkdir -p `pwd`/scratch/tmp
+        gatk --java-options '-Xmx${params.memory_create_genomics_db}' GenomicsDBImport \\
+            --reference ${params.reference} \\
+            ${intervals_option} \\
+            --genomicsdb-workspace-path genomics_db \\
+            --tmp-dir `pwd`/scratch/tmp \\
+            ${input_vcfs}
+        """
+}
+
+process CREATE_PON {
+    cpus 1
+    memory params.memory_create_pon
+    tag "pon.vcf"
+    publishDir "${params.output}", mode: "copy"
+
+    conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
+
+    input:
+    	path(genomics_db)
+
+    output:
+    	file("pon.vcf")
+        file("pon.vcf.idx")
+
+    script:
+    	"""
+        gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\
+            --reference ${params.reference} \\
+            --germline-resource ${params.gnomad} \\
+            --variant gendb://${genomics_db} \\
+            --output pon.vcf
+    	"""
+}
+
+// workflow definitions section
+
+workflow {
+    MUTECT2(input_files)
+
+    CREATE_GENOMICS_DB(
+        MUTECT2.out.vcf
+            .map { name, vcf -> vcf }
+            .collect()
+    )
+
+    CREATE_PON(CREATE_GENOMICS_DB.out)
+}
diff --git a/panel_of_normals/nextflow.config b/panel_of_normals/nextflow.config
new file mode 100644
index 0000000..4c7412a
--- /dev/null
+++ b/panel_of_normals/nextflow.config
@@ -0,0 +1,81 @@
+/*
+ * -------------------------------------------------
+ *  TRON-Bioinformatics/tronflow-mutect2 Nextflow config file
+ * -------------------------------------------------
+ * Default config options for all environments.
+ */
+
+profiles {
+    conda {
+        params.enable_conda         = true
+        conda.enabled               = true
+        conda.createTimeout         = "120 min"
+    }
+    debug {
+        cleanup                     = false
+        dumpHashes                  = true
+        process.beforeScript        = "echo $HOSTNAME" 
+    }
+    mamba {
+        params.enable_conda         = true
+        conda.enabled               = true
+        conda.createTimeout         = "120 min"
+        conda.useMamba              = true
+    }
+    test {
+        timeline.enabled            = false
+        report.enabled              = false
+        trace.enabled               = false
+        dag.enabled                 = false
+    }
+}
+
+// export these variables to prevent local Python/R libraries from conflicting with those in the container
+env {
+    PYTHONNOUSERSITE    = 1
+    R_PROFILE_USER      = "/.Rprofile"
+    R_ENVIRON_USER      = "/.Renviron"
+}
+
+// capture exit codes from upstream processes when piping
+process.shell = ["/bin/bash", "-euo", "pipefail"]
+
+VERSION     = "1.8.0"
+DOI         = "https://zenodo.org/badge/latestdoi/355860788"
+
+manifest {
+    name                = "TRON-Bioinformatics/tronflow-mutect2"
+    author              = "Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr"
+    homePage            = "https://github.com/TRON-Bioinformatics/tronflow-mutect2"
+    description         = "Mutect2 best practices workflow to create a panel of normals"
+    mainScript          = "main.nf"
+    nextflowVersion     = ">=19.10.0"
+    version             = VERSION
+    doi                 = DOI
+}
+params.manifest = manifest
+
+params.help_message = """
+${params.manifest.name} v${params.manifest.version}
+Author: ${params.manifest.author}
+
+${params.manifest.description}
+
+Usage:
+    nextflow run ${params.manifest.mainScript} --input_files input_files
+
+This workflow aims to compute a panel of normals to be used with MuTect2
+
+Input:
+    * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON
+    	example:
+	sample1	/path/to/sample1.bam
+	sample2	/path/to/sample2.bam
+	NOTE: the sample name must be set in the @SN annotation
+
+Optional input:
+    * output: the folder where to publish output
+
+Output:
+    * Output combined VCF pon.vcf
+"""

From 7efb9a0478b21e51a41cc3611cfa1272322c9dd5 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Sat, 23 Mar 2024 13:21:07 +0100
Subject: [PATCH 2/8] Introduce some performance improvements

---
 panel_of_normals/main.nf | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf
index ac0a267..2541f93 100755
--- a/panel_of_normals/main.nf
+++ b/panel_of_normals/main.nf
@@ -11,7 +11,7 @@ params.input_files = false
 params.intervals = false
 params.max_mnp_distance = 0
 params.memory_mutect2 = "16g"
-params.memory_create_genomics_db = "8g"
+params.memory_create_genomics_db = "16g"
 params.memory_create_pon = "32g"
 params.reference = false
 params.output = "output"
@@ -67,18 +67,20 @@ process MUTECT2 {
         intervals_option = params.intervals ? "--intervals ${params.intervals}" : ""
 
         """
+        mkdir -p `pwd`/scratch/tmp
         gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\
             --reference ${params.reference} \\
             ${intervals_option} \\
             --input ${bam} \\
             --tumor-sample ${name} \\
             --max-mnp-distance ${params.max_mnp_distance} \\
-            --output ${name}.mutect2.unfiltered.vcf
+            --output ${name}.mutect2.unfiltered.vcf \\
+            --tmp-dir `pwd`/scratch/tmp
         """
 }
 
 process CREATE_GENOMICS_DB {
-    cpus 1
+    cpus 2
     memory params.memory_create_genomics_db
     tag "genomics_db"
 
@@ -101,6 +103,9 @@ process CREATE_GENOMICS_DB {
             ${intervals_option} \\
             --genomicsdb-workspace-path genomics_db \\
             --tmp-dir `pwd`/scratch/tmp \\
+            --genomicsdb-shared-posixfs-optimizations true \\
+            --batch-size 50 \\
+            --bypass-feature-reader \\
             ${input_vcfs}
         """
 }
@@ -122,11 +127,13 @@ process CREATE_PON {
 
     script:
     	"""
+        mkdir -p `pwd`/scratch/tmp
         gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\
             --reference ${params.reference} \\
             --germline-resource ${params.gnomad} \\
             --variant gendb://${genomics_db} \\
-            --output pon.vcf
+            --output pon.vcf \\
+            --tmp-dir `pwd`/scratch/tmp
     	"""
 }
 
@@ -135,11 +142,11 @@ process CREATE_PON {
 workflow {
     MUTECT2(input_files)
 
-    CREATE_GENOMICS_DB(
-        MUTECT2.out.vcf
-            .map { name, vcf -> vcf }
-            .collect()
-    )
+    // CREATE_GENOMICS_DB(
+    //     MUTECT2.out.vcf
+    //         .map { name, vcf -> vcf }
+    //         .collect()
+    // )
 
-    CREATE_PON(CREATE_GENOMICS_DB.out)
+    // CREATE_PON(CREATE_GENOMICS_DB.out)
 }

From 47f514f2eec48d1988f566b28713713d47462cf1 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Sun, 24 Mar 2024 12:20:54 +0100
Subject: [PATCH 3/8] Try to parallelize steps of pipeline

---
 panel_of_normals/main.nf | 110 +++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 32 deletions(-)

diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf
index 2541f93..afacd0e 100755
--- a/panel_of_normals/main.nf
+++ b/panel_of_normals/main.nf
@@ -11,10 +11,12 @@ params.input_files = false
 params.intervals = false
 params.max_mnp_distance = 0
 params.memory_mutect2 = "16g"
-params.memory_create_genomics_db = "16g"
+params.memory_gather_vcfs = "32g"
+params.memory_genomicsdb_import = "16g"
 params.memory_create_pon = "32g"
 params.reference = false
 params.output = "output"
+params.genomicsdb_output = "output"
 
 // checks of required input arguments
 
@@ -45,69 +47,115 @@ if (! params.gnomad) {
     exit 1, "The following argument is required: --gnomad!"
 }
 
+if (params.intervals) {
+    Channel
+        .fromPath(params.intervals)
+        .splitText(by: 30000, file: true)
+        .set { intervals }
+} else {
+    exit 1, "The following argument is required: --intervals!"
+}
+
 // process definitions section
 
 process MUTECT2 {
     cpus 2
     memory params.memory_mutect2
-    tag "${name}"
-    publishDir "${params.output}/${name}", mode: "copy"
+    tag "${name}-${interval}"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
     input:
-    	tuple val(name), val(bam)
+    	tuple val(name), val(bam), path(interval)
 
     output:
-	    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf"), emit: vcf
-        tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.idx"), emit: vcf_idx
-        tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.stats"), emit: vcf_stats
+	    tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz")
 
     script:
-        intervals_option = params.intervals ? "--intervals ${params.intervals}" : ""
-
         """
         mkdir -p `pwd`/scratch/tmp
         gatk --java-options '-Xmx${params.memory_mutect2}' Mutect2 \\
             --reference ${params.reference} \\
-            ${intervals_option} \\
+            --intervals ${interval} \\
             --input ${bam} \\
             --tumor-sample ${name} \\
             --max-mnp-distance ${params.max_mnp_distance} \\
-            --output ${name}.mutect2.unfiltered.vcf \\
+            --output ${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz \\
             --tmp-dir `pwd`/scratch/tmp
         """
 }
 
-process CREATE_GENOMICS_DB {
-    cpus 2
-    memory params.memory_create_genomics_db
-    tag "genomics_db"
+process GATHER_VCFS {
+    cpus 1
+    memory params.memory_gather_vcfs
+    tag "${name}"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
     input:
-    	val(vcf_list)
+    	tuple val(name), val(vcf_list)
 
     output:
-	    path("genomics_db", type: "dir")
+	    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), path("${name}.mutect2.unfiltered.vcf.gz.tbi")
 
     script:
-        intervals_option = params.intervals ? "--intervals ${params.intervals}" : ""
-        input_vcfs = vcf_list.collect{"--variant " + it}.join(" ")
+        // NOTE: the input VCFs need to be provided in order by genomic coordinates
+    	input_vcfs = vcf_list
+            .sort { a, b -> file(a).baseName.compareTo(file(b).baseName) }
+            .collect{"-INPUT " + it}.join(" ")
+        """
+        mkdir -p `pwd`/scratch/tmp
+        gatk --java-options '-Xmx${params.memory_gather_vcfs} -Djava.io.tmpdir=`pwd`/scratch/tmp' GatherVcfs \\
+            ${input_vcfs} \\
+            -OUTPUT ${name}.mutect2.unfiltered.vcf.gz
+        tabix -p vcf ${name}.mutect2.unfiltered.vcf.gz
+        """
+}
+
+process GENOMICSDB_IMPORT {
+    cpus 2
+    memory params.memory_genomicsdb_import
+    tag "${name}"
 
+    conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
+
+    input:
+    	tuple val(name), path(vcf), path(vcf_tbi)
+
+    output:
+	    val("Import completed")
+
+    script:
+        if (! file("${params.genomicsdb_output}/pon.gdb").exists() ||  
+            file("${params.genomicsdb_output}/pon.gdb").list().length == 0) {
+        """
+        mkdir -p `pwd`/scratch/tmp
+        gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\
+            --reference ${params.reference} \\
+            --intervals ${params.intervals} \\
+            --genomicsdb-workspace-path ${params.genomicsdb_output}/pon.gdb \\
+            --tmp-dir `pwd`/scratch/tmp \\
+            --genomicsdb-shared-posixfs-optimizations true \\
+            --batch-size 50 \\
+            --bypass-feature-reader \\
+            --merge-input-intervals \\
+            --variant ${vcf}
+        """
+        } else {
         """
         mkdir -p `pwd`/scratch/tmp
-        gatk --java-options '-Xmx${params.memory_create_genomics_db}' GenomicsDBImport \\
+        gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\
             --reference ${params.reference} \\
-            ${intervals_option} \\
-            --genomicsdb-workspace-path genomics_db \\
+            --intervals ${params.intervals} \\
+            --genomicsdb-update-workspace-path ${params.genomicsdb_output}/pon.gdb \\
             --tmp-dir `pwd`/scratch/tmp \\
             --genomicsdb-shared-posixfs-optimizations true \\
             --batch-size 50 \\
             --bypass-feature-reader \\
-            ${input_vcfs}
+            --merge-input-intervals \\
+            --variant ${vcf}
         """
+        }
 }
 
 process CREATE_PON {
@@ -119,7 +167,7 @@ process CREATE_PON {
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
     input:
-    	path(genomics_db)
+        val(genomicsdb_import_completed)
 
     output:
     	file("pon.vcf")
@@ -131,7 +179,7 @@ process CREATE_PON {
         gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\
             --reference ${params.reference} \\
             --germline-resource ${params.gnomad} \\
-            --variant gendb://${genomics_db} \\
+            --variant gendb://${params.genomicsdb_output}/pon.gdb \\
             --output pon.vcf \\
             --tmp-dir `pwd`/scratch/tmp
     	"""
@@ -140,13 +188,11 @@ process CREATE_PON {
 // workflow definitions section
 
 workflow {
-    MUTECT2(input_files)
+    MUTECT2(input_files.combine(intervals))
+
+    GATHER_VCFS(MUTECT2.out.groupTuple())
 
-    // CREATE_GENOMICS_DB(
-    //     MUTECT2.out.vcf
-    //         .map { name, vcf -> vcf }
-    //         .collect()
-    // )
+    GENOMICSDB_IMPORT(GATHER_VCFS.out)
 
-    // CREATE_PON(CREATE_GENOMICS_DB.out)
+    CREATE_PON(GENOMICSDB_IMPORT.out.collect())
 }

From bd216e2000eb6006b078538d8d56cb50616e7321 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Sun, 24 Mar 2024 14:49:00 +0100
Subject: [PATCH 4/8] Execute GenomicsDBImport only once for all sample VCFs -
 first working version

---
 panel_of_normals/main.nf | 73 ++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/panel_of_normals/main.nf b/panel_of_normals/main.nf
index afacd0e..f32194f 100755
--- a/panel_of_normals/main.nf
+++ b/panel_of_normals/main.nf
@@ -6,17 +6,16 @@ nextflow.enable.dsl = 2
 // params definitions section
 
 params.help= false
-params.gnomad = false
 params.input_files = false
+params.reference = false
 params.intervals = false
+params.gnomad = false
+params.output = "output"
 params.max_mnp_distance = 0
 params.memory_mutect2 = "16g"
 params.memory_gather_vcfs = "32g"
 params.memory_genomicsdb_import = "16g"
 params.memory_create_pon = "32g"
-params.reference = false
-params.output = "output"
-params.genomicsdb_output = "output"
 
 // checks of required input arguments
 
@@ -62,6 +61,7 @@ process MUTECT2 {
     cpus 2
     memory params.memory_mutect2
     tag "${name}-${interval}"
+    publishDir "${params.output}/logs/${name}", pattern: "*.mutect2.log", mode: "copy"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
@@ -69,7 +69,8 @@ process MUTECT2 {
     	tuple val(name), val(bam), path(interval)
 
     output:
-	    tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz")
+	    tuple val("${name}"), path("${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz"), emit: vcf
+        path("${name}.${interval.baseName}.mutect2.log"), emit: log
 
     script:
         """
@@ -82,6 +83,7 @@ process MUTECT2 {
             --max-mnp-distance ${params.max_mnp_distance} \\
             --output ${name}.${interval.baseName}.mutect2.unfiltered.vcf.gz \\
             --tmp-dir `pwd`/scratch/tmp
+        cp .command.log ${name}.${interval.baseName}.mutect2.log
         """
 }
 
@@ -89,6 +91,8 @@ process GATHER_VCFS {
     cpus 1
     memory params.memory_gather_vcfs
     tag "${name}"
+    publishDir "${params.output}", pattern: "*.mutect2.unfiltered.vcf*", mode: "copy"
+    publishDir "${params.output}/logs/${name}", pattern: "*.gathervcfs.log", mode: "copy"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
@@ -96,7 +100,9 @@ process GATHER_VCFS {
     	tuple val(name), val(vcf_list)
 
     output:
-	    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), path("${name}.mutect2.unfiltered.vcf.gz.tbi")
+	    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz"), emit: vcf
+        tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf.gz.tbi"), emit: vcf_tbi
+        path("${name}.gathervcfs.log"), emit: log
 
     script:
         // NOTE: the input VCFs need to be provided in order by genomic coordinates
@@ -109,69 +115,59 @@ process GATHER_VCFS {
             ${input_vcfs} \\
             -OUTPUT ${name}.mutect2.unfiltered.vcf.gz
         tabix -p vcf ${name}.mutect2.unfiltered.vcf.gz
+        cp .command.log ${name}.gathervcfs.log
         """
 }
 
 process GENOMICSDB_IMPORT {
     cpus 2
     memory params.memory_genomicsdb_import
-    tag "${name}"
+    tag "pon.gdb"
+    publishDir "${params.output}/logs", pattern: "genomicsdbimport.log", mode: "copy"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
     input:
-    	tuple val(name), path(vcf), path(vcf_tbi)
+    	val(vcf_list)
 
     output:
-	    val("Import completed")
+	    path("pon.gdb"), emit: genomicsdb
+        path("genomicsdbimport.log"), emit: log
 
     script:
-        if (! file("${params.genomicsdb_output}/pon.gdb").exists() ||  
-            file("${params.genomicsdb_output}/pon.gdb").list().length == 0) {
-        """
-        mkdir -p `pwd`/scratch/tmp
-        gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\
-            --reference ${params.reference} \\
-            --intervals ${params.intervals} \\
-            --genomicsdb-workspace-path ${params.genomicsdb_output}/pon.gdb \\
-            --tmp-dir `pwd`/scratch/tmp \\
-            --genomicsdb-shared-posixfs-optimizations true \\
-            --batch-size 50 \\
-            --bypass-feature-reader \\
-            --merge-input-intervals \\
-            --variant ${vcf}
-        """
-        } else {
+        input_vcfs = vcf_list.collect{"--variant " + it}.join(" ")
         """
         mkdir -p `pwd`/scratch/tmp
         gatk --java-options '-Xmx${params.memory_genomicsdb_import}' GenomicsDBImport \\
             --reference ${params.reference} \\
             --intervals ${params.intervals} \\
-            --genomicsdb-update-workspace-path ${params.genomicsdb_output}/pon.gdb \\
+            --genomicsdb-workspace-path pon.gdb \\
             --tmp-dir `pwd`/scratch/tmp \\
             --genomicsdb-shared-posixfs-optimizations true \\
             --batch-size 50 \\
             --bypass-feature-reader \\
             --merge-input-intervals \\
-            --variant ${vcf}
+            ${input_vcfs}
+        cp .command.log genomicsdbimport.log
         """
-        }
 }
 
 process CREATE_PON {
     cpus 1
     memory params.memory_create_pon
     tag "pon.vcf"
-    publishDir "${params.output}", mode: "copy"
+    publishDir "${params.output}", pattern: "pon.vcf*", mode: "copy"
+    publishDir "${params.output}/logs", pattern: "createsomaticpanelofnormals.log", mode: "copy"
 
     conda "bioconda::gatk4=4.2.6.1 bioconda::samtools=1.12"
 
     input:
-        val(genomicsdb_import_completed)
+        path(genomics_db)
 
     output:
-    	file("pon.vcf")
-        file("pon.vcf.idx")
+    	path("pon.vcf")
+        path("pon.vcf.idx")
+        path("createsomaticpanelofnormals.log")
 
     script:
     	"""
@@ -179,9 +175,10 @@ process CREATE_PON {
         gatk --java-options '-Xmx${params.memory_create_pon}' CreateSomaticPanelOfNormals \\
             --reference ${params.reference} \\
             --germline-resource ${params.gnomad} \\
-            --variant gendb://${params.genomicsdb_output}/pon.gdb \\
+            --variant gendb://${genomics_db} \\
             --output pon.vcf \\
             --tmp-dir `pwd`/scratch/tmp
+        cp .command.log createsomaticpanelofnormals.log
     	"""
 }
 
@@ -190,9 +187,13 @@ process CREATE_PON {
 workflow {
     MUTECT2(input_files.combine(intervals))
 
-    GATHER_VCFS(MUTECT2.out.groupTuple())
+    GATHER_VCFS(MUTECT2.out.vcf.groupTuple())
 
-    GENOMICSDB_IMPORT(GATHER_VCFS.out)
+    GENOMICSDB_IMPORT(
+        GATHER_VCFS.out.vcf
+            .map { name, vcf -> vcf }
+            .collect()
+    )
 
-    CREATE_PON(GENOMICSDB_IMPORT.out.collect())
+    CREATE_PON(GENOMICSDB_IMPORT.out.genomicsdb)
 }

From 3bfc1179b0f7c1a50237187632d2fc2d6f680027 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Sun, 24 Mar 2024 14:49:43 +0100
Subject: [PATCH 5/8] Update help message in config file

---
 panel_of_normals/nextflow.config | 34 ++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/panel_of_normals/nextflow.config b/panel_of_normals/nextflow.config
index 4c7412a..9f7de1f 100644
--- a/panel_of_normals/nextflow.config
+++ b/panel_of_normals/nextflow.config
@@ -1,6 +1,6 @@
 /*
  * -------------------------------------------------
- *  TRON-Bioinformatics/tronflow-mutect2 Nextflow config file
+ *  TRON-Bioinformatics/tronflow-mutect2/panel_of_normals Nextflow config file
  * -------------------------------------------------
  * Default config options for all environments.
  */
@@ -44,10 +44,10 @@ VERSION     = "1.8.0"
 DOI         = "https://zenodo.org/badge/latestdoi/355860788"
 
 manifest {
-    name                = "TRON-Bioinformatics/tronflow-mutect2"
+    name                = "TRON-Bioinformatics/tronflow-mutect2/panel_of_normals"
     author              = "Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr"
     homePage            = "https://github.com/TRON-Bioinformatics/tronflow-mutect2"
-    description         = "Mutect2 best practices workflow to create a panel of normals"
+    description         = "MuTect2 best practices workflow to create a panel of normals"
     mainScript          = "main.nf"
     nextflowVersion     = ">=19.10.0"
     version             = VERSION
@@ -57,25 +57,33 @@ params.manifest = manifest
 
 params.help_message = """
 ${params.manifest.name} v${params.manifest.version}
-Author: ${params.manifest.author}
+Authors: ${params.manifest.author}
 
 ${params.manifest.description}
 
 Usage:
-    nextflow run ${params.manifest.mainScript} --input_files input_files
-
-This workflow aims to compute a panel of normals to be used with MuTect2
+    nextflow run ${params.manifest.mainScript} --input_files INPUT_FILES --reference REFERENCE --intervals INTERVALS --gnomad GNOMAD [--output [OUTPUT]] [--max_mnp_distance [MAX_MNP_DISTANCE]] [--memory_mutect2 [MEMORY_MUTECT2]] [--memory_gather_vcfs [MEMORY_GATHER_VCFS]] [--memory_genomicsdb_import [MEMORY_GENOMICSDB_IMPORT]] [--memory_create_pon [MEMORY_CREATE_PON]]
 
 Input:
-    * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON
+    * input_files: the path to a tab-separated values file containing in each row the sample name and the path to a BAM file to be considered for computing the PON (indexes expected *.bai)
     	example:
-	sample1	/path/to/sample1.bam
-	sample2	/path/to/sample2.bam
-	NOTE: the sample name must be set in the @SN annotation
+        sample1 /path/to/sample1.bam
+        sample2 /path/to/sample2.bam
+        NOTE: the sample name must be set in the @SM annotation of the BAM header
+    * reference: the path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * intervals: the path to a BED file containing the genomic regions to analyze
+    * gnomad: the path to the gnomad VCF or other germline resource
 
 Optional input:
-    * output: the folder where to publish output
+    * output: the folder where to publish output (default: output)
+    * max_mnp_distance: maximum MNP distance that will be passed to Mutect2 (default: 0, recommended for creating a PON)
+    * memory_mutect2: the ammount of memory used by Mutect2 (default: 16g)
+    * memory_gather_vcfs: the ammount of memory used by Picard's GatherVcfs (default: 32g)
+    * memory_genomicsdb_import: the ammount of memory used by GenomicsDBImport (default: 16g)
+    * memory_create_pon: the ammount of memory used by CreateSomaticPanelOfNormals (default: 32g)
 
 Output:
-    * Output combined VCF pon.vcf
+    * Output VCF files of single samples and TBI indexes
+    * Output combined VCF `pon.vcf` and index `pon.vcf.idx`
+    * Log files of all processes in `logs/` directory
 """

From ba6b3b3fd52f9b3058250b3353a899966fcec318 Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Tue, 26 Mar 2024 13:36:10 +0100
Subject: [PATCH 6/8] Update README regarding new PON pipeline

---
 README.md | 58 +++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 8eb7a67..c146ed4 100644
--- a/README.md
+++ b/README.md
@@ -138,28 +138,16 @@ java -jar /code/picard/2.21.2/picard.jar LiftoverVcf INPUT=/projects/data/gatk_b
 
 ### Panel Of Normals (PON)
 
-The PON is used to filter out technical artifacts from the somatic variant calls. The PON is recommended to be formed from technically similar samples (ie: same sequencing platform, same sample preparation), from healthy and young individuals and to be formed by a minimum of 40 samples (see https://gatkforums.broadinstitute.org/gatk/discussion/11053/panel-of-normals-pon ).
+The PON is used to filter out technical artifacts from the somatic variant calls. The PON is recommended to be formed from technically similar samples (ie: same sequencing platform, same sample preparation), from healthy and young individuals and to be formed by a minimum of 40 samples (see [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) for further details).
 
 The normal samples are processed by the BAM preprocessing pipeline including marking duplicates and BQSR.
 
-Run MuTect2 on each normal sample as follows:
+To compute the PON, we implemented a best practice [Nextflow pipeline](panel_of_normals/main.nf) according to the GATK documentation:
 
-```
-java -Xmx16g -jar /code/gatk/4.1.3.0/gatk-package-4.1.3.0-local.jar \
-    Mutect2 \
-    --reference ${params.reference} \
-    --intervals ${interval} \
-    --input ${bam} \
-    --tumor-sample ${name} \
-    --max-mnp-distance 0 \
-    --output ${bam.baseName}.${interval.baseName}.mutect.vcf
-```
-
-Note the parameter "--max-mnp-distance 0" is needed to avoid MNPs being called.
+- [(How to) Call somatic mutations using GATK4 Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)  
+- [Panel of Normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON)  
 
-The multiple VCFs need to be combined with the GATK tool "CreateSomaticPanelOfNormals".
-
-This is implemented in the pipeline `mutect2_pon.vcf`.
+Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. 
 
 ### Configuring Funcotator
 
@@ -171,25 +159,37 @@ Also, make sure that the reference version provided to funcotator with `--refere
 
 ## How to run the Panel of Normals (PON) pipeline
 
-```
-$ nextflow mutect2_pon.nf --help
-Usage:
-    mutect2_pon.nf --input_files input_files
+```bash
+TRON-Bioinformatics/tronflow-mutect2/panel_of_normals v1.8.0
+Authors: Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola, Julian Thomas Mohr
+
+MuTect2 best practices workflow to create a panel of normals
 
-This workflow aims to compute a panel of normals to be used with MuTect2
+Usage:
+    nextflow run main.nf --input_files INPUT_FILES --reference REFERENCE --intervals INTERVALS --gnomad GNOMAD [--output [OUTPUT]] [--max_mnp_distance [MAX_MNP_DISTANCE]] [--memory_mutect2 [MEMORY_MUTECT2]] [--memory_gather_vcfs [MEMORY_GATHER_VCFS]] [--memory_genomicsdb_import [MEMORY_GENOMICSDB_IMPORT]] [--memory_create_pon [MEMORY_CREATE_PON]]
 
 Input:
-    * input_files: the path to a file containing in each row the sample name and the path to a BAM file to be included in the PON
-    	example:
-	sample1	/path/to/sample1.bam
-	sample2	/path/to/sample2.bam
-	NOTE: the sample name must be set in the @SN annotation
+    * input_files: the path to a tab-separated values file containing in each row the sample name and the path to a BAM file to be considered for computing the PON (indexes expected *.bai)
+        example:
+        sample1 /path/to/sample1.bam
+        sample2 /path/to/sample2.bam
+        NOTE: the sample name must be set in the @SM annotation of the BAM header
+    * reference: the path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * intervals: the path to a BED file containing the genomic regions to analyze
+    * gnomad: the path to the gnomad VCF or other germline resource
 
 Optional input:
-    * output: the folder where to publish output
+    * output: the folder where to publish output (default: output)
+    * max_mnp_distance: maximum MNP distance that will be passed to Mutect2 (default: 0, recommended for creating a PON)
+    * memory_mutect2: the ammount of memory used by Mutect2 (default: 16g)
+    * memory_gather_vcfs: the ammount of memory used by Picard's GatherVcfs (default: 32g)
+    * memory_genomicsdb_import: the ammount of memory used by GenomicsDBImport (default: 16g)
+    * memory_create_pon: the ammount of memory used by CreateSomaticPanelOfNormals (default: 32g)
 
 Output:
-    * Output combined VCF pon.vcf
+    * Output VCF files of single samples and TBI indexes
+    * Output combined VCF `pon.vcf` and index `pon.vcf.idx`
+    * Log files of all processes in `logs/` directory
 ```
 
 ## References

From 97a59f940a94daf176f4dfb8560cfe73d375e72f Mon Sep 17 00:00:00 2001
From: Julian Thomas Mohr <julian.mohr@tron-mainz.de>
Date: Tue, 26 Mar 2024 13:39:58 +0100
Subject: [PATCH 7/8] Add reference to GenomicsDBImport usage and performance
 guidelines to README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c146ed4..0e0e3cb 100644
--- a/README.md
+++ b/README.md
@@ -147,7 +147,7 @@ To compute the PON, we implemented a best practice [Nextflow pipeline](panel_of_
 - [(How to) Call somatic mutations using GATK4 Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)  
 - [Panel of Normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON)  
 
-Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. 
+Please note that the new best practice workflow according to the GATK documentation (see also [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2)) now uses GenomicsDB for scalability. Therefore, we included a new step for `GenomicsDBImport` in our pipeline. To improve the performance of this step, we applied the recommendations according to GATK's [GenomicsDBImport usage and performance guidelines](https://gatk.broadinstitute.org/hc/en-us/articles/360056138571-GenomicsDBImport-usage-and-performance-guidelines). 
 
 ### Configuring Funcotator
 

From 4fd55de9a038123a5bed9ac45dfa6aff665a450e Mon Sep 17 00:00:00 2001
From: lkress <35309241+LKress@users.noreply.github.com>
Date: Fri, 14 Jun 2024 11:41:02 +0200
Subject: [PATCH 8/8] Update java version in automated_tests.yml

* Required as nextflow version was built against a non-canonical more recent java version
---
 .github/workflows/automated_tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml
index a38aaff..39e3dc6 100644
--- a/.github/workflows/automated_tests.yml
+++ b/.github/workflows/automated_tests.yml
@@ -8,10 +8,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
-    - uses: actions/setup-java@v2
+    - uses: actions/setup-java@v3
       with:
         distribution: 'zulu' # See 'Supported distributions' for available options
-        java-version: '8'
+        java-version: '11'
     - uses: conda-incubator/setup-miniconda@v2
       with:
         auto-update-conda: true
@@ -28,4 +28,4 @@ jobs:
         key: ${{ runner.os }}-tronflow-mutect2
     - name: Run tests
       run: |
-        make
\ No newline at end of file
+        make