diff --git a/NEWS b/NEWS index f3e719aa..0ac9160e 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,11 @@ Changes affecting specific commands: +* bcftools consensus + + - Add new --regions-overlap option which allows to take into account overlapping deletions + that start out of the fasta file target region. + * bcftools norm - Change the order of atomization and multiallelic splitting (when both -a,-m are given) diff --git a/consensus.c b/consensus.c index 2b58670c..66a65b32 100644 --- a/consensus.c +++ b/consensus.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2023 Genome Research Ltd. + Copyright (c) 2014-2024 Genome Research Ltd. Author: Petr Danecek @@ -118,7 +118,7 @@ typedef struct char **argv; int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied; uint8_t *iupac_bitmask, *iupac_als; - int miupac_bitmask, miupac_als; + int miupac_bitmask, miupac_als, regions_overlap; char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; char mark_del, mark_ins, mark_snv; smpl_ilist_t *smpl; @@ -348,20 +348,28 @@ static void init_region(args_t *args, char *line) args->prev_base_pos = -1; args->fa_buf.l = 0; args->fa_length = 0; - args->fa_end_pos = to; - args->fa_ori_pos = from; - args->fa_src_pos = from; + args->fa_end_pos = to; // 0-based + args->fa_ori_pos = from; // 0-based + args->fa_src_pos = from; // 0-based args->fa_mod_off = 0; args->fa_frz_pos = -1; args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; + + // bcf_sr_set_regions accepts 1-based coordinates kstring_t str = {0,0,0}; - if ( from==0 ) from = 1; - if ( to==0 ) to = HTS_POS_MAX; + if ( !from ) from = 1; + else from++; +#ifndef MAX_CSI_COOR +#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1) +#endif + if ( to==0 ) to = MAX_CSI_COOR - 1; + else to++; ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to); - bcf_sr_set_regions(args->files,line,0); + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); + bcf_sr_set_regions(args->files,str.s,0); free(str.s); if ( tmp_ptr ) *tmp_ptr = tmp; @@ -790,15 +798,42 @@ static void apply_variant(args_t *args, bcf1_t *rec) } + char *ref_allele = rec->d.allele[0]; char *alt_allele = rec->d.allele[ialt]; int rmme_alt = 0; int len_diff = 0, alen = 0; - int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; + int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence if ( idx<0 ) { - fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - return; + if ( alt_allele[0]=='<' ) // symbolic allele + { + rec->pos -= idx + 1; + rec->rlen += idx + 1; + idx = -1; + } + else if ( strlen(ref_allele) < -idx ) // the ref allele is shorter but overlaps the fa sequence? This should never happen + { + assert(0); + fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + return; + } + else if ( strlen(alt_allele) > -idx ) // the ref allele overlaps the fa and so does the alt allele + { + rec->pos -= idx; + rec->rlen += idx; + ref_allele -= idx; + alt_allele -= idx; + idx = 0; + } + else // the ref allele overlaps the fa but alt allele does not: trim to leave one base before + { + rec->pos -= idx + 1; + rec->rlen += idx + 1; + ref_allele -= idx + 1; + alt_allele += strlen(alt_allele) - 1; + idx = -1; + } } if ( rec->rlen > args->fa_buf.l - idx ) { @@ -813,7 +848,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( idx>=args->fa_buf.l ) + if ( idx>0 && idx>=args->fa_buf.l ) error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base @@ -828,7 +863,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( !strcasecmp(alt_allele,"") ) { static int multibase_ref_del_warned = 0; - if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned ) + if ( ref_allele[1]!=0 && !multibase_ref_del_warned ) { fprintf(stderr, "Warning: one REF base is expected with , assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n" @@ -837,7 +872,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } if ( args->mark_del ) // insert dashes instead of delete sequence { - alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del); + alt_allele = mark_del(ref_allele, rec->rlen, NULL, args->mark_del); alen = rec->rlen; len_diff = 0; rmme_alt = 1; @@ -845,7 +880,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) else { len_diff = 1-rec->rlen; - alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event + alt_allele = ref_allele; // according to VCF spec, the first REF base must precede the event alen = 1; } } @@ -856,7 +891,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) return; } } - else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) + else if ( idx>=0 && strncasecmp(ref_allele,args->fa_buf.s+idx,rec->rlen) ) { // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA), // the reference base in fa_buf is lost and the check fails. We do not keep a buffer @@ -864,10 +899,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) // one base overlap int fail = 1; - if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) ) + if ( args->prev_base_pos==rec->pos && toupper(ref_allele[0])==toupper(args->prev_base) ) { if ( rec->rlen==1 ) fail = 0; - else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0; + else if ( !strncasecmp(ref_allele+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0; } if ( fail ) @@ -883,7 +918,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) " REF .vcf: [%s]\n" " ALT .vcf: [%s]\n" " REF .fa : [%s]%c%s\n", - bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx, + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, ref_allele, alt_allele, args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); } @@ -892,7 +927,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( args->mark_del && len_diff<0 ) { - alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; len_diff = 0; rmme_alt = 1; @@ -905,23 +940,24 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( args->mark_del && len_diff<0 ) { - alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; len_diff = 0; rmme_alt = 1; } } - args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER; + int safe_idx = idx<0 ? 0 : idx; // idx can be negative in case of overlapping deletion + args->fa_case = toupper(args->fa_buf.s[safe_idx])==args->fa_buf.s[safe_idx] ? TO_UPPER : TO_LOWER; if ( args->fa_case==TO_UPPER ) for (i=0; imark_ins && len_diff>0 ) - mark_ins(rec->d.allele[0], alt_allele, args->mark_ins); + mark_ins(ref_allele, alt_allele, args->mark_ins); if ( args->mark_snv ) - mark_snv(rec->d.allele[0], alt_allele, args->mark_snv); + mark_snv(ref_allele, alt_allele, args->mark_snv); if ( len_diff <= 0 ) { @@ -953,7 +989,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) // 1 C T // 1 C CAA int ibeg = 0; - while ( ibegd.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; + while ( ibegpos + ibeg <= args->prev_base_pos ) ibeg++; for (i=ibeg; ifa_buf.s[idx+i] = alt_allele[i]; @@ -962,7 +998,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if (args->chain && len_diff != 0) { // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant) - if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0) + if ( strncasecmp(ref_allele,alt_allele,1) == 0) { // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1); @@ -1135,6 +1171,7 @@ static void usage(args_t *args) fprintf(stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); fprintf(stderr, " -p, --prefix STRING Prefix to add to output sequence names\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n"); fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(stderr, "Examples:\n"); @@ -1151,6 +1188,7 @@ int main_consensus(int argc, char *argv[]) { args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; + args->regions_overlap = 1; static struct option loptions[] = { @@ -1172,6 +1210,7 @@ int main_consensus(int argc, char *argv[]) {"absent",1,0,'a'}, {"chain",1,0,'c'}, {"prefix",required_argument,0,'p'}, + {"regions-overlap",required_argument,0,5}, {0,0,0,0} }; int c; @@ -1192,6 +1231,10 @@ int main_consensus(int argc, char *argv[]) else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0]; else error("The argument is not recognised: --mark-snv %s\n",optarg); break; + case 5 : + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); + break; case 'p': args->chr_prefix = optarg; break; case 's': args->sample = optarg; break; case 'S': args->sample_fname = optarg; break; diff --git a/test/consensus.22.fa b/test/consensus.22.fa new file mode 100644 index 00000000..e506efea --- /dev/null +++ b/test/consensus.22.fa @@ -0,0 +1,10 @@ +>1:11-19 +ACGTACGT +>2:11-19 +ACGTACGT +>3:11-19 +ACGTACGT +>4:11-19 +ACGTACGT +>5:11-19 +ACGTACGT diff --git a/test/consensus.22.vcf b/test/consensus.22.vcf new file mode 100644 index 00000000..5b4c2fd8 --- /dev/null +++ b/test/consensus.22.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.2 +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a +1 8 . ACGA A . . . GT 1/1 +1 12 . C A . . . GT 1/1 +2 8 . ACGA TTTT . . . GT 1/1 +2 12 . C A . . . GT 1/1 +3 10 . A T . . . GT 1/1 +3 12 . C A . . . GT 1/1 +4 8 . A . . END=11 GT 1/1 +4 12 . C A . . . GT 1/1 +5 8 . ACGA ACG . . . GT 1/1 +5 12 . C A . . . GT 1/1 diff --git a/test/consensus22.1.out b/test/consensus22.1.out new file mode 100644 index 00000000..34612499 --- /dev/null +++ b/test/consensus22.1.out @@ -0,0 +1,10 @@ +>1:11-19 +AAGTACGT +>2:11-19 +AAGTACGT +>3:11-19 +AAGTACGT +>4:11-19 +AAGTACGT +>5:11-19 +AAGTACGT diff --git a/test/consensus22.2.out b/test/consensus22.2.out new file mode 100644 index 00000000..164dd9c0 --- /dev/null +++ b/test/consensus22.2.out @@ -0,0 +1,10 @@ +>1:11-19 +.AGTACGT +>2:11-19 +TAGTACGT +>3:11-19 +AAGTACGT +>4:11-19 +.AGTACGT +>5:11-19 +.AGTACGT diff --git a/test/consensus22.3.out b/test/consensus22.3.out new file mode 100644 index 00000000..2eaa731b --- /dev/null +++ b/test/consensus22.3.out @@ -0,0 +1,10 @@ +>1:11-19 +AGTACGT +>2:11-19 +TAGTACGT +>3:11-19 +AAGTACGT +>4:11-19 +AGTACGT +>5:11-19 +AGTACGT diff --git a/test/test.pl b/test/test.pl index c998b6d8..52a423f9 100755 --- a/test/test.pl +++ b/test/test.pl @@ -880,6 +880,12 @@ run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.3.out',fa=>'consensus.20.fa',args=>'-M . -s b'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.4.out',fa=>'consensus.20.fa',args=>'-M . -s a'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.21',out=>'consensus21.1.out',fa=>'consensus.21.fa',args=>''); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.1.out',fa=>'consensus.22.fa',args=>'--regions-overlap 0 --mark-del .'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.2.out',fa=>'consensus.22.fa',args=>'--regions-overlap 1 --mark-del .'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.2.out',fa=>'consensus.22.fa',args=>'--regions-overlap 2 --mark-del .'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.1.out',fa=>'consensus.22.fa',args=>'--regions-overlap 0'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 1'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 2'); run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite