From 0446448f376b655a0a0f0708a63b97b3f9ef08c0 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 12 Aug 2022 07:35:24 +0200 Subject: [PATCH 1/7] add tentative whatshap implementation --- main.nf | 10 ++++-- modules/05_phasing.nf | 32 +++++++++++++++++++ ...annotation.nf => 06_variant_annotation.nf} | 0 tests/run_test_11.sh | 18 +++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 modules/05_phasing.nf rename modules/{05_variant_annotation.nf => 06_variant_annotation.nf} (100%) create mode 100644 tests/run_test_11.sh diff --git a/main.nf b/main.nf index 5480b3c..43edf95 100755 --- a/main.nf +++ b/main.nf @@ -6,7 +6,8 @@ include { FILTER_VCF } from './modules/01_filter' include { BCFTOOLS_NORM; VT_DECOMPOSE_COMPLEX; REMOVE_DUPLICATES } from './modules/02_normalization' include { SUMMARY_VCF; SUMMARY_VCF as SUMMARY_VCF_2 } from './modules/03_summary' include { VAFATOR; MULTIALLELIC_FILTER } from './modules/04_vafator' -include { VARIANT_ANNOTATION } from './modules/05_variant_annotation' +include { WHATSHAP } from './modules/05_phasing' +include { VARIANT_ANNOTATION } from './modules/06_variant_annotation' params.help= false params.input_vcfs = false @@ -22,6 +23,7 @@ params.memory = "4g" params.skip_multiallelic_filter = false params.snpeff_organism = false params.snpeff_datadir = false +params.phasing_sample = false if (params.help) { @@ -91,9 +93,13 @@ workflow { VAFATOR(final_vcfs.join(input_bams.groupTuple())) final_vcfs = VAFATOR.out.annotated_vcf if ( ! params.skip_multiallelic_filter ) { - final_vcfs = MULTIALLELIC_FILTER(final_vcfs) + MULTIALLELIC_FILTER(final_vcfs) final_vcfs = MULTIALLELIC_FILTER.out.filtered_vcf } + if (params.phasing_sample) { + WHATSHAP(final_vcfs.join(input_bams.groupTuple())) + final_vcfs = WHATSHAP.out.phased_vcf + } } if (params.snpeff_organism) { diff --git a/modules/05_phasing.nf b/modules/05_phasing.nf new file mode 100644 index 0000000..6ed6342 --- /dev/null +++ b/modules/05_phasing.nf @@ -0,0 +1,32 @@ +params.phasing_sample = false +params.cpus = 1 +params.memory = '3g' + + +process WHATSHAP { + cpus params.cpus + memory params.memory + tag "${patient_name}" + publishDir "${params.output}/${patient_name}", mode: "copy" + + conda (params.enable_conda ? "bioconda::whatshap=1.4" : null) + + input: + tuple val(patient_name), file(vcf), val(bams) + + output: + tuple val(patient_name), file("${patient_name}.phased.vcf"), emit: phased_vcf + + script: + phasing_bams = bams.toList().stream() + .filter{ b -> b.split(':')[0] == "${params.phasing_sample}" } + .collect{ b -> b.split(":")[1] } + .join(" ") ; + """ + whatshap phase \ + -o ${patient_name}.phased.vcf \ + --reference=${params.reference} \ + ${vcf} \ + ${phasing_bams} + """ +} \ No newline at end of file diff --git a/modules/05_variant_annotation.nf b/modules/06_variant_annotation.nf similarity index 100% rename from modules/05_variant_annotation.nf rename to modules/06_variant_annotation.nf diff --git a/tests/run_test_11.sh b/tests/run_test_11.sh new file mode 100644 index 0000000..d7de96e --- /dev/null +++ b/tests/run_test_11.sh @@ -0,0 +1,18 @@ +#!/bin/bash + + +source tests/assert.sh +output_folder=output/test11 +echo -e "tumor_normal\tprimary:"`pwd`"/test_data/TESTX_S1_L001.bam" > test_data/test_bams.txt +echo -e "tumor_normal\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --phasing_sample normal +test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.phased.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.phased.vcf || { echo "Missing test 10 output file!"; exit 1; } From 17922140f188f07566509ee4168dd655dc32b1e8 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:17:35 +0200 Subject: [PATCH 2/7] avoids bcftools csq to fail when no phase available --- main.nf | 4 ++-- modules/05_phasing.nf | 1 + modules/06_variant_annotation.nf | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index b7242a5..6c3c616 100755 --- a/main.nf +++ b/main.nf @@ -35,7 +35,7 @@ params.skip_multiallelic_filter = false // SnpEff input params.snpeff_organism = false params.snpeff_datadir = false -params.phasing = false +params.phasing_sample = false if (params.help) { @@ -140,7 +140,7 @@ workflow { final_vcfs = MULTIALLELIC_FILTER(final_vcfs) final_vcfs = MULTIALLELIC_FILTER.out.filtered_vcf } - if (params.phasing) { + if (params.phasing_sample) { WHATSHAP(final_vcfs.join(input_bams.groupTuple())) final_vcfs = WHATSHAP.out.phased_vcf } diff --git a/modules/05_phasing.nf b/modules/05_phasing.nf index 6ed6342..b0713c3 100644 --- a/modules/05_phasing.nf +++ b/modules/05_phasing.nf @@ -24,6 +24,7 @@ process WHATSHAP { .join(" ") ; """ whatshap phase \ + --indel \ -o ${patient_name}.phased.vcf \ --reference=${params.reference} \ ${vcf} \ diff --git a/modules/06_variant_annotation.nf b/modules/06_variant_annotation.nf index b469ae1..c79115e 100644 --- a/modules/06_variant_annotation.nf +++ b/modules/06_variant_annotation.nf @@ -49,6 +49,7 @@ process VARIANT_ANNOTATION_BCFTOOLS { --fasta-ref ${params.reference} \\ --gff-annot ${params.gff} ${vcf} \\ --output-type v \\ - --output ${name}.annotated.vcf + --output ${name}.annotated.vcf \\ + --phase a """ } From b4462dd2f7f5ba6db94c67c0b988d8a7fea3ced0 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:25:18 +0200 Subject: [PATCH 3/7] fix tests --- Makefile | 1 + test/scripts/run_test_14.sh | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 109fe6e..4c607fe 100644 --- a/Makefile +++ b/Makefile @@ -29,3 +29,4 @@ test: bash test/scripts/run_test_11.sh bash test/scripts/run_test_12.sh bash test/scripts/run_test_13.sh + bash test/scripts/run_test_14.sh diff --git a/test/scripts/run_test_14.sh b/test/scripts/run_test_14.sh index d7de96e..e8857ee 100644 --- a/test/scripts/run_test_14.sh +++ b/test/scripts/run_test_14.sh @@ -1,18 +1,16 @@ #!/bin/bash -source tests/assert.sh -output_folder=output/test11 -echo -e "tumor_normal\tprimary:"`pwd`"/test_data/TESTX_S1_L001.bam" > test_data/test_bams.txt -echo -e "tumor_normal\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt -echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt -echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt -echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt -echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt -nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --phasing_sample normal -test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } -test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } +source test/scripts/assert.sh +output_folder=test/output/test14 +echo -e "tumor_normal\tprimary:"`pwd`"/test/data/TESTX_S1_L001.bam" > test/data/test_bams.txt +echo -e "tumor_normal\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt + +nextflow main.nf -profile test,conda --output $output_folder --input_bams test/data/test_bams.txt \ + --phasing_sample normal --input_vcfs test/data/test_input_single_sample.txt + test -s $output_folder/single_sample/single_sample.phased.vcf || { echo "Missing test 10 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.phased.vcf || { echo "Missing test 10 output file!"; exit 1; } From 148de11722efd5f6e41102dbe2c19ec6f5f03c95 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:25:45 +0200 Subject: [PATCH 4/7] bump version --- nextflow.config | 2 +- test/data/test_input_single_sample.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 test/data/test_input_single_sample.txt diff --git a/nextflow.config b/nextflow.config index bcea7da..be871bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -VERSION = '2.5.0' +VERSION = '2.6.0' manifest { name = 'TRON-Bioinformatics/tronflow-vcf-postprocessing' diff --git a/test/data/test_input_single_sample.txt b/test/data/test_input_single_sample.txt new file mode 100644 index 0000000..e221c35 --- /dev/null +++ b/test/data/test_input_single_sample.txt @@ -0,0 +1 @@ +single_sample test/data/test_single_sample.vcf \ No newline at end of file From aff0446886a9572e481f78c9633bf705363b6133 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:26:58 +0200 Subject: [PATCH 5/7] bump version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index be871bf..e2b9a89 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -VERSION = '2.6.0' +VERSION = '3.0.0' manifest { name = 'TRON-Bioinformatics/tronflow-vcf-postprocessing' From eb6868a05cdb84a80854ecac8f05f26abb725ae4 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:42:01 +0200 Subject: [PATCH 6/7] document phasing --- README.md | 19 +++++++++++++++++++ nextflow.config | 1 + 2 files changed, 20 insertions(+) diff --git a/README.md b/README.md index ef731d6..1ba516b 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,25 @@ These annotations are provided by VAFator (https://github.com/TRON-Bioinformatic No technical annotations are performed if the parameter `--input_bams` is not passed. +## Phasing with WhatsHap + +The phase of the mutations can be inferred from the reads pileup. +This information is relevant to determine the impact of nearby mutations. +WhatsHap adds the phasing information when possible to the VCF following the VCF specification for that purpose. +Genotypes with phase information use the `|` instead of the `/` and `0|1` and `1|0` represent the two different phases. +A homozygous mutations in phase is represented as `1|1`. +Furthermore, because phasing may be incomplete, to define the different phased blocks the INFO/PS annotation used. +All mutations belonging to the same phase block share the same PS number. + +BAM files need to be provided to perform phasing with `--input_bams` and the phasing sample has to be determined with +`--phasing_sample`. + +This approach has some limitations: +- We only support diploid genotyped VCF files. WhatsHap does support polyploid samples. But the particular case of +somatic mutations is not supported. +- The sample name chosen to perform the phasing has to be the same across all VCFs. +Not to be mistaken with the patient name. Eg: only `normal` samples across all patients can be used for phasing + ## Functional annotations The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect diff --git a/nextflow.config b/nextflow.config index e2b9a89..d608e48 100644 --- a/nextflow.config +++ b/nextflow.config @@ -87,6 +87,7 @@ Optional input: * --snpeff_memory: for some samples SnpEff may require to use more memory (default: 3g) * --mapping_quality: VAFator minimum mapping quality (default: 0) * --base_call_quality: VAFator minimum base call quality (default: 0) + * --phasing-sample: enables phasing with whatshap and defines the sample BAM to phase the VCF (BAMs need to be provided) Output: * Normalized VCF file From 6f591338edafe8c4570d62fa959aa5c850dfc575 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Thu, 20 Oct 2022 07:42:21 +0200 Subject: [PATCH 7/7] fix test automation --- .github/workflows/automated_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index f63adc7..77165c0 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -26,4 +26,5 @@ jobs: run: conda config --show-sources - name: Run tests run: | + export NXF_VER=22.04.5 make \ No newline at end of file