Skip to content

Commit

Permalink
Merge pull request #13 from TRON-Bioinformatics/whatshap
Browse files Browse the repository at this point in the history
Integrate whatshap for phasing and fix bcftools csq
  • Loading branch information
priesgo authored Oct 20, 2022
2 parents 6a71584 + 6f59133 commit 2f5928f
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 3 deletions.
1 change: 1 addition & 0 deletions .github/workflows/automated_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ jobs:
run: conda config --show-sources
- name: Run tests
run: |
export NXF_VER=22.04.5
make
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ test:
bash test/scripts/run_test_11.sh
bash test/scripts/run_test_12.sh
bash test/scripts/run_test_13.sh
bash test/scripts/run_test_14.sh
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,25 @@ These annotations are provided by VAFator (https://github.com/TRON-Bioinformatic

No technical annotations are performed if the parameter `--input_bams` is not passed.

## Phasing with WhatsHap

The phase of the mutations can be inferred from the reads pileup.
This information is relevant to determine the impact of nearby mutations.
WhatsHap adds the phasing information when possible to the VCF following the VCF specification for that purpose.
Genotypes with phase information use the `|` instead of the `/` and `0|1` and `1|0` represent the two different phases.
A homozygous mutations in phase is represented as `1|1`.
Furthermore, because phasing may be incomplete, to define the different phased blocks the INFO/PS annotation used.
All mutations belonging to the same phase block share the same PS number.

BAM files need to be provided to perform phasing with `--input_bams` and the phasing sample has to be determined with
`--phasing_sample`.

This approach has some limitations:
- We only support diploid genotyped VCF files. WhatsHap does support polyploid samples. But the particular case of
somatic mutations is not supported.
- The sample name chosen to perform the phasing has to be the same across all VCFs.
Not to be mistaken with the patient name. Eg: only `normal` samples across all patients can be used for phasing

## Functional annotations

The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect
Expand Down
10 changes: 9 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ include { FILTER_VCF } from './modules/01_filter'
include { BCFTOOLS_NORM; VT_DECOMPOSE_COMPLEX; REMOVE_DUPLICATES } from './modules/02_normalization'
include { SUMMARY_VCF; SUMMARY_VCF as SUMMARY_VCF_2 } from './modules/03_summary'
include { VAFATOR; MULTIALLELIC_FILTER } from './modules/04_vafator'
include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/05_variant_annotation'
include { WHATSHAP } from './modules/05_phasing'
include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/06_variant_annotation'


params.help= false
params.input_vcfs = false
params.input_bams = false
params.input_vcf = false

// optional VAFator inputs
Expand All @@ -32,6 +35,7 @@ params.skip_multiallelic_filter = false
// SnpEff input
params.snpeff_organism = false
params.snpeff_datadir = false
params.phasing_sample = false


if (params.help) {
Expand Down Expand Up @@ -136,6 +140,10 @@ workflow {
final_vcfs = MULTIALLELIC_FILTER(final_vcfs)
final_vcfs = MULTIALLELIC_FILTER.out.filtered_vcf
}
if (params.phasing_sample) {
WHATSHAP(final_vcfs.join(input_bams.groupTuple()))
final_vcfs = WHATSHAP.out.phased_vcf
}
}

if (params.snpeff_organism) {
Expand Down
33 changes: 33 additions & 0 deletions modules/05_phasing.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
params.phasing_sample = false
params.cpus = 1
params.memory = '3g'


process WHATSHAP {
cpus params.cpus
memory params.memory
tag "${patient_name}"
publishDir "${params.output}/${patient_name}", mode: "copy"

conda (params.enable_conda ? "bioconda::whatshap=1.4" : null)

input:
tuple val(patient_name), file(vcf), val(bams)

output:
tuple val(patient_name), file("${patient_name}.phased.vcf"), emit: phased_vcf

script:
phasing_bams = bams.toList().stream()
.filter{ b -> b.split(':')[0] == "${params.phasing_sample}" }
.collect{ b -> b.split(":")[1] }
.join(" ") ;
"""
whatshap phase \
--indel \
-o ${patient_name}.phased.vcf \
--reference=${params.reference} \
${vcf} \
${phasing_bams}
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ process VARIANT_ANNOTATION_BCFTOOLS {
--fasta-ref ${params.reference} \\
--gff-annot ${params.gff} ${vcf} \\
--output-type v \\
--output ${name}.annotated.vcf
--output ${name}.annotated.vcf \\
--phase a
"""
}
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ env {
// Capture exit codes from upstream processes when piping
process.shell = ['/bin/bash', '-euo', 'pipefail']

VERSION = '2.5.0'
VERSION = '3.0.0'

manifest {
name = 'TRON-Bioinformatics/tronflow-vcf-postprocessing'
Expand Down Expand Up @@ -87,6 +87,7 @@ Optional input:
* --snpeff_memory: for some samples SnpEff may require to use more memory (default: 3g)
* --mapping_quality: VAFator minimum mapping quality (default: 0)
* --base_call_quality: VAFator minimum base call quality (default: 0)
* --phasing-sample: enables phasing with whatshap and defines the sample BAM to phase the VCF (BAMs need to be provided)
Output:
* Normalized VCF file
Expand Down
1 change: 1 addition & 0 deletions test/data/test_input_single_sample.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
single_sample test/data/test_single_sample.vcf
16 changes: 16 additions & 0 deletions test/scripts/run_test_14.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash


source test/scripts/assert.sh
output_folder=test/output/test14
echo -e "tumor_normal\tprimary:"`pwd`"/test/data/TESTX_S1_L001.bam" > test/data/test_bams.txt
echo -e "tumor_normal\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt
echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt
echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt
echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt
echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt

nextflow main.nf -profile test,conda --output $output_folder --input_bams test/data/test_bams.txt \
--phasing_sample normal --input_vcfs test/data/test_input_single_sample.txt

test -s $output_folder/single_sample/single_sample.phased.vcf || { echo "Missing test 10 output file!"; exit 1; }

0 comments on commit 2f5928f

Please sign in to comment.