-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.nf
360 lines (308 loc) · 14 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#!/usr/bin/env nextflow
//help function for the tool
def show_help (){
log.info IARC_Header()
log.info tool_header()
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run iarcbioinfo/purple-nf -singularity [OPTIONS]
Mandatory arguments:
--tn_file [file] File containing list of T/N bam/cram files to be processed
--cohort_dir [dir] directory where the BAM or CRAM file are stored
--ref [file] fasta file of reference genome [hg38.fa], should be indexed [hg38.fa.fai]
--ref_dict [file] dict file for the reference genomep [hg38.dict]
Optional arguments:
--tumor_only [flag] active tumor_only mode
--bam [flag] active bam mode [def:cram]
--output_folder [string] name of output folder
--cpu [Integer] Number of CPUs[def:2]
--mem [Integer] Max memory [def:8Gb]
--somatic_vcfs [file] file containing list of somatic VCF variants with tumor_id and vcf_path, by default assumes that tumor_sample is the second listed sample of the VCF file.
""".stripIndent()
}
//we display help information
if (params.help){ show_help(); exit 0;}
//we display the header of the tool
log.info IARC_Header()
log.info tool_header()
//Check mandatory parameters
assert (params.ref != null) : "please specify --ref reference.fasta"
assert (params.ref_dict != null) : "please specify --ref_dict reference.dict"
assert (params.tn_file != null ) : "please specify --tn_file"
assert (params.cohort_dir != null ) : "please specify --cohort_dir"
//function that read the tumors to process from a tn_file
if(params.tn_file){
def cram = params.bam ? false:true
tn_pairs = parse_tn_file(params.tn_file,params.cohort_dir,cram)
//we duplicate the tn_pairs channel
tn_pairs.into { tn_pairs_cobalt; tn_pairs_amber}
}
//function to read the somatic vcfs
if(params.somatic_vcfs){
svcfs=parse_vcf_file(params.somatic_vcfs)
}else{
svcfs=channel.empty()
}
//chanel for reference genome
ref_fasta = Channel.value(file(params.ref)).ifEmpty{exit 1, "reference file not found: ${params.ref}"}
ref_dict = Channel.value(file(params.ref_dict)).ifEmpty{exit 1, "Dict reference file not found: ${params.ref_dict}"}
ref_fai = Channel.value(file(params.ref+'.fai')).ifEmpty{exit 1, "index file not found: ${params.ref}.fai"}
print_params()
//PATHS in the container for databases
// /hmftools/hg38
// /hmftools/hg38/DiploidRegions.38.bed
// /hmftools/hg38/GC_profile.1000bp.38.cnp
// /hmftools/hg38/GermlineHetPon.38.vcf
//run preprocesing with bcftools
process HQ_VCF{
cpus '1'
memory '1G'
tag "hq_vcf"
publishDir params.output_folder+'/VCF_highconf/', mode: 'copy'
input:
set val(tumor_id), file(vcf) from svcfs
output:
set val(tumor_id), file("${vcf.baseName}_highconf.vcf.gz") into hc_vcfs
when:
params.somatic_vcfs != 'null'
script:
if(params.debug == false){
"""
#we assume that tumor WGS is the second sample in the VCF file
# SNVs variants with a minimum read support of 5 are kept as high quality
bcftools view -Oz -e 'FORMAT/AD[1:0]<5 | TYPE!="snps"' ${vcf} -o ${vcf.baseName}_filter.vcf.gz
#we reheader the vcf file to match purple, cobalt, and amber sample names
zcat ${vcf.baseName}_filter.vcf.gz | egrep "^#CHR" |\
awk -v tid=${tumor_id} '{print \$(NF-1)" "tid"_N"; print \$NF" "tid"_T"}' > rename_sample.txt
bcftools reheader -s rename_sample.txt -o ${vcf.baseName}_highconf.vcf.gz ${vcf.baseName}_filter.vcf.gz
"""
}else{
"""
touch ${vcf.baseName}_highconf.vcf.gz
"""
}
}
process COBALT {
cpus params.cpu
memory params.mem+'G'
publishDir params.output_folder+'/COBALT/', mode: 'copy'
input:
set val(tumor_id), file(tumor), file(tumor_index), file(normal), file(normal_index) from tn_pairs_cobalt
file(ref) from ref_fasta
file(fai) from ref_fai
output:
set val(tumor_id), path("${tumor_id}_COBALT") into cobalt
script:
if(params.debug == false){
if(params.tumor_only){
"""
COBALT -Xms1g -Xmx${params.mem}g -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
-ref_genome ${ref} -tumor_only -tumor_only_diploid_bed /hmftools/hg38/DiploidRegions.38.bed \\
-tumor ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_COBALT -threads ${params.cpu}
"""
}else{
"""
COBALT -Xms1g -Xmx${params.mem}g -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
-ref_genome ${ref} -reference ${tumor_id}_N -reference_bam ${normal} \\
-tumor ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_COBALT -threads ${params.cpu}
"""
}
}else{
//debug code with options
"""
mkdir ${tumor_id}_COBALT
touch ${tumor_id}_COBALT/${tumor_id}.cobalt
"""
}
}
process AMBER {
cpus params.cpu
memory params.mem+'G'
publishDir params.output_folder+'/AMBER/', mode: 'copy'
input:
set val(tumor_id), file(tumor), file(tumor_index), file(normal), file(normal_index) from tn_pairs_amber
file(ref) from ref_fasta
file(fai) from ref_fai
output:
set val(tumor_id), path("${tumor_id}_AMBER") into amber
script:
if(params.debug == false){
if(params.tumor_only){
"""
AMBER -Xms1g -Xmx${params.mem}g -loci /hmftools/hg38/GermlineHetPon.38.vcf -ref_genome ${ref} -tumor_only \\
-tumor ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_AMBER -threads ${params.cpu}
"""
}else{
"""
AMBER -Xms1g -Xmx${params.mem}g -loci /hmftools/hg38/GermlineHetPon.38.vcf -ref_genome ${ref} \\
-reference ${tumor_id}_N -reference_bam ${normal} \\
-tumor ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_AMBER -threads ${params.cpu}
"""
}
}else{
"""
mkdir ${tumor_id}_AMBER
touch ${tumor_id}_AMBER/${tumor_id}.amber
"""
}
}
//we merge previous results from amber and cobalt
amber_cobalt=amber.join(cobalt, remainder: true)
//we ask if somatics where given and join them to the amber_cobalt struct
if(params.somatic_vcfs){
amber_cobalt_vcf=amber_cobalt.join(hc_vcfs,remainder:true)
}else{
//when no vcfs files are given
vcf_dump = file('NO_VCF')
amber_cobalt_vcf=amber_cobalt.spread([vcf_dump])
}
//dumping some channels
//amber_cobalt_vcf.into{print_vcfs;amber_cobalt_vcf2}
//print_vcfs.view()
//we have to merge the different inputs
process PURPLE {
cpus params.cpu
memory params.mem+'G'
publishDir params.output_folder+'/PURPLE/', mode: 'copy'
input:
set val(tumor_id), path(amber_dir), path(cobalt_dir), file(hcvcf) from amber_cobalt_vcf
file(ref) from ref_fasta
file(fai) from ref_fai
file(dict) from ref_dict
output:
set val(tumor_id), path("${tumor_id}_PURPLE") into purple
file("${tumor_id}_T.purple.purity.sample.tsv") into stats_purple
script:
def include_vcf_purple = hcvcf.name != 'NO_VCF' ? "-somatic_vcf ${hcvcf}" : ''
if(params.debug == false){
if(params.tumor_only){
"""
PURPLE -Xms1g -Xmx${params.mem}g -tumor_only -tumor ${tumor_id}_T \\
-no_charts \\
-output_dir ${tumor_id}_PURPLE \\
-amber ${amber_dir} \\
-cobalt ${cobalt_dir} \\
-gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
-threads ${params.cpu} \\
-ref_genome ${ref} ${include_vcf_purple}
awk -v tumor=${tumor_id} '{print tumor"\t"\$0}' ${tumor_id}_PURPLE/${tumor_id}_T.purple.purity.tsv > ${tumor_id}_T.purple.purity.sample.tsv
"""
}else{
"""
PURPLE -Xms1g -Xmx${params.mem}g -reference ${tumor_id}_N -tumor ${tumor_id}_T \\
-no_charts \\
-output_dir ${tumor_id}_PURPLE \\
-amber ${amber_dir} \\
-cobalt ${cobalt_dir} \\
-gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
-threads ${params.cpu} \\
-ref_genome ${ref} ${include_vcf_purple}
awk -v tumor=${tumor_id} '{print tumor"\t"\$0}' ${tumor_id}_PURPLE/${tumor_id}_T.purple.purity.tsv > ${tumor_id}_T.purple.purity.sample.tsv
"""
}
}else{
"""
echo PURPLE -Xms1g -Xmx${params.mem}g -reference ${tumor_id}_N -tumor ${tumor_id}_T \\
-no_charts \\
-output_dir ${tumor_id}_PURPLE \\
-amber ${amber_dir} \\
-cobalt ${cobalt_dir} \\
-gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
-threads ${params.cpu} \\
-ref_genome ${ref} ${include_vcf_purple}
mkdir ${tumor_id}_PURPLE
touch ${tumor_id}_T.purple.purity.sample.tsv
"""
}
}
stats_purple.collectFile(name: 'purple_summary.txt', storeDir: params.output_folder, seed: 'tumor_id\tpurity\tnormFactor\tscore\tdiploidProportion\tploidy\tgender\tstatus\tpolyclonalProportion\tminPurity\tmaxPurity\tminPloidy\tmaxPloidy\tminDiploidProportion\tmaxDiploidProportion\tversion\tsomaticPenalty\twholeGenomeDuplication\tmsIndelsPerMb\tmsStatus\ttml\ttmlStatus\ttmbPerMb\ttmbStatus\tsvTumorMutationalBurden\n', newLine: false, skip: 1)
/*
*
* Functions to create channels from TSV or directories containing BAM/CRAM
*
*/
//we read the pairs from tn_file
def parse_tn_file (tn_file,path,cram){
// FOR INPUT AS A TAB DELIMITED FILE
def file_ext = cram ? '.crai':'.bai'
//[sample t[.bam,cram] t[.bai,crai] n[.bam,.cram] n[.bai,.crai]]
def tn_pairs=Channel.fromPath(tn_file)
.splitCsv(header: true, sep: '\t', strip: true)
.map{row -> [ row.tumor_id,
file(path + "/" + row.tumor),
file(path + "/" + row.tumor+file_ext),
file(path + "/" + row.normal),
file(path + "/" + row.normal+file_ext)]}
.ifEmpty{exit 1, "${tn_file} was empty - no tumor/normal supplied" }
//we return the channel
return tn_pairs
}
//we parse the somatics vcf files
def parse_vcf_file (vcf_file){
//[sample_id vcf_path]
def svcfs=Channel.fromPath(vcf_file)
.splitCsv(header: true, sep: '\t', strip: true)
.map{row -> [ row.tumor_id,
file(row.vcf_path)]}
.ifEmpty{exit 1, "${svcfs} was empty - no vcf files supplied" }
//we return the channel
return svcfs
}
// print the calling parameter to the log and a log file
def print_params () {
//software versions for v2.0
def software_versions = ['hmftools-cobalt' : '1.11',
'hmftools-amber' : '2.52',
'hmftools-purple' : '3.5' ]
//we print the parameters
log.info "\n"
log.info "-\033[2m------------------Calling PARAMETERS--------------------\033[0m-"
log.info params.collect{ k,v -> "${k.padRight(18)}: $v"}.join("\n")
log.info "-\033[2m--------------------------------------------------------\033[0m-"
log.info "\n"
log.info "-\033[2m------------------Software versions--------------------\033[0m-"
log.info software_versions.collect{ k,v -> "${k.padRight(18)}: $v"}.join("\n")
log.info "-\033[2m--------------------------------------------------------\033[0m-"
log.info "\n"
//we print the parameters to a log file
def output_d = new File("${params.output_folder}/nf-pipeline_info/")
if (!output_d.exists()) {
output_d.mkdirs()
}
def output_tf = new File(output_d, "run_parameters_report.txt")
def report_params="------------------Calling PARAMETERS--------------------\n"
report_params+= params.collect{ k,v -> "${k.padRight(18)}: $v"}.join("\n")
report_params+="\n--------------------------------------------------------\n"
report_params+="\n------------------NEXTFLOW Metadata--------------------\n"
report_params+="nextflow version : "+nextflow.version+"\n"
report_params+="nextflow build : "+nextflow.build+"\n"
report_params+="Command line : \n"+workflow.commandLine.split(" ").join(" \\\n")
report_params+="\n--------------------------------------------------------\n"
report_params+="-----------------Software versions--------------------\n"
report_params+=software_versions.collect{ k,v -> "${k.padRight(18)}: $v"}.join("\n")
report_params+="\n--------------------------------------------------------\n"
output_tf.withWriter { w -> w << report_params}
}
//this use ANSI colors to make a short tool description
//useful url: http://www.lihaoyi.com/post/BuildyourownCommandLinewithANSIescapecodes.html
def tool_header (){
return """
PURPLE: Somatic CNV caller (${workflow.manifest.version})
"""
}
//header for the IARC tools
// the logo was generated using the following page
// http://patorjk.com/software/taag (ANSI logo generator)
def IARC_Header (){
return """
#################################################################################
# ██╗ █████╗ ██████╗ ██████╗██████╗ ██╗ ██████╗ ██╗███╗ ██╗███████╗ ██████╗ #
# ██║██╔══██╗██╔══██╗██╔════╝██╔══██╗██║██╔═══██╗██║████╗ ██║██╔════╝██╔═══██╗ #
# ██║███████║██████╔╝██║ ██████╔╝██║██║ ██║██║██╔██╗ ██║█████╗ ██║ ██║ #
# ██║██╔══██║██╔══██╗██║ ██╔══██╗██║██║ ██║██║██║╚██╗██║██╔══╝ ██║ ██║ #
# ██║██║ ██║██║ ██║╚██████╗██████╔╝██║╚██████╔╝██║██║ ╚████║██║ ╚██████╔╝ #
# ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═════╝ ╚═╝ ╚═════╝ ╚═╝╚═╝ ╚═══╝╚═╝ ╚═════╝ #
# Nextflow pipelines for cancer genomics.########################################
"""
}