diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index f63adc7..77165c0 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -26,4 +26,5 @@ jobs: run: conda config --show-sources - name: Run tests run: | + export NXF_VER=22.04.5 make \ No newline at end of file diff --git a/Makefile b/Makefile index 109fe6e..4c607fe 100644 --- a/Makefile +++ b/Makefile @@ -29,3 +29,4 @@ test: bash test/scripts/run_test_11.sh bash test/scripts/run_test_12.sh bash test/scripts/run_test_13.sh + bash test/scripts/run_test_14.sh diff --git a/README.md b/README.md index ef731d6..1ba516b 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,25 @@ These annotations are provided by VAFator (https://github.com/TRON-Bioinformatic No technical annotations are performed if the parameter `--input_bams` is not passed. +## Phasing with WhatsHap + +The phase of the mutations can be inferred from the reads pileup. +This information is relevant to determine the impact of nearby mutations. +WhatsHap adds the phasing information when possible to the VCF following the VCF specification for that purpose. +Genotypes with phase information use the `|` instead of the `/` and `0|1` and `1|0` represent the two different phases. +A homozygous mutations in phase is represented as `1|1`. +Furthermore, because phasing may be incomplete, to define the different phased blocks the INFO/PS annotation used. +All mutations belonging to the same phase block share the same PS number. + +BAM files need to be provided to perform phasing with `--input_bams` and the phasing sample has to be determined with +`--phasing_sample`. + +This approach has some limitations: +- We only support diploid genotyped VCF files. WhatsHap does support polyploid samples. But the particular case of +somatic mutations is not supported. +- The sample name chosen to perform the phasing has to be the same across all VCFs. +Not to be mistaken with the patient name. Eg: only `normal` samples across all patients can be used for phasing + ## Functional annotations The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect diff --git a/main.nf b/main.nf index 92497f7..6c3c616 100755 --- a/main.nf +++ b/main.nf @@ -6,10 +6,13 @@ include { FILTER_VCF } from './modules/01_filter' include { BCFTOOLS_NORM; VT_DECOMPOSE_COMPLEX; REMOVE_DUPLICATES } from './modules/02_normalization' include { SUMMARY_VCF; SUMMARY_VCF as SUMMARY_VCF_2 } from './modules/03_summary' include { VAFATOR; MULTIALLELIC_FILTER } from './modules/04_vafator' -include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/05_variant_annotation' +include { WHATSHAP } from './modules/05_phasing' +include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/06_variant_annotation' + params.help= false params.input_vcfs = false +params.input_bams = false params.input_vcf = false // optional VAFator inputs @@ -32,6 +35,7 @@ params.skip_multiallelic_filter = false // SnpEff input params.snpeff_organism = false params.snpeff_datadir = false +params.phasing_sample = false if (params.help) { @@ -136,6 +140,10 @@ workflow { final_vcfs = MULTIALLELIC_FILTER(final_vcfs) final_vcfs = MULTIALLELIC_FILTER.out.filtered_vcf } + if (params.phasing_sample) { + WHATSHAP(final_vcfs.join(input_bams.groupTuple())) + final_vcfs = WHATSHAP.out.phased_vcf + } } if (params.snpeff_organism) { diff --git a/modules/05_phasing.nf b/modules/05_phasing.nf new file mode 100644 index 0000000..b0713c3 --- /dev/null +++ b/modules/05_phasing.nf @@ -0,0 +1,33 @@ +params.phasing_sample = false +params.cpus = 1 +params.memory = '3g' + + +process WHATSHAP { + cpus params.cpus + memory params.memory + tag "${patient_name}" + publishDir "${params.output}/${patient_name}", mode: "copy" + + conda (params.enable_conda ? "bioconda::whatshap=1.4" : null) + + input: + tuple val(patient_name), file(vcf), val(bams) + + output: + tuple val(patient_name), file("${patient_name}.phased.vcf"), emit: phased_vcf + + script: + phasing_bams = bams.toList().stream() + .filter{ b -> b.split(':')[0] == "${params.phasing_sample}" } + .collect{ b -> b.split(":")[1] } + .join(" ") ; + """ + whatshap phase \ + --indel \ + -o ${patient_name}.phased.vcf \ + --reference=${params.reference} \ + ${vcf} \ + ${phasing_bams} + """ +} \ No newline at end of file diff --git a/modules/05_variant_annotation.nf b/modules/06_variant_annotation.nf similarity index 95% rename from modules/05_variant_annotation.nf rename to modules/06_variant_annotation.nf index b469ae1..c79115e 100644 --- a/modules/05_variant_annotation.nf +++ b/modules/06_variant_annotation.nf @@ -49,6 +49,7 @@ process VARIANT_ANNOTATION_BCFTOOLS { --fasta-ref ${params.reference} \\ --gff-annot ${params.gff} ${vcf} \\ --output-type v \\ - --output ${name}.annotated.vcf + --output ${name}.annotated.vcf \\ + --phase a """ } diff --git a/nextflow.config b/nextflow.config index bcea7da..d608e48 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -VERSION = '2.5.0' +VERSION = '3.0.0' manifest { name = 'TRON-Bioinformatics/tronflow-vcf-postprocessing' @@ -87,6 +87,7 @@ Optional input: * --snpeff_memory: for some samples SnpEff may require to use more memory (default: 3g) * --mapping_quality: VAFator minimum mapping quality (default: 0) * --base_call_quality: VAFator minimum base call quality (default: 0) + * --phasing-sample: enables phasing with whatshap and defines the sample BAM to phase the VCF (BAMs need to be provided) Output: * Normalized VCF file diff --git a/test/data/test_input_single_sample.txt b/test/data/test_input_single_sample.txt new file mode 100644 index 0000000..e221c35 --- /dev/null +++ b/test/data/test_input_single_sample.txt @@ -0,0 +1 @@ +single_sample test/data/test_single_sample.vcf \ No newline at end of file diff --git a/test/scripts/run_test_14.sh b/test/scripts/run_test_14.sh new file mode 100644 index 0000000..e8857ee --- /dev/null +++ b/test/scripts/run_test_14.sh @@ -0,0 +1,16 @@ +#!/bin/bash + + +source test/scripts/assert.sh +output_folder=test/output/test14 +echo -e "tumor_normal\tprimary:"`pwd`"/test/data/TESTX_S1_L001.bam" > test/data/test_bams.txt +echo -e "tumor_normal\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt + +nextflow main.nf -profile test,conda --output $output_folder --input_bams test/data/test_bams.txt \ + --phasing_sample normal --input_vcfs test/data/test_input_single_sample.txt + +test -s $output_folder/single_sample/single_sample.phased.vcf || { echo "Missing test 10 output file!"; exit 1; }