From 3f89c0ae16265d7cb29e38ca9337ea11b3626e50 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Wed, 18 Dec 2024 18:23:36 +0000 Subject: [PATCH] review updates --- htslib/vcf.h | 128 ++----------------------------------- vcf.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 168 insertions(+), 133 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 4daf6b1ee..778a25ecb 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2025 Genome Research Ltd. Author: Heng Li @@ -1501,63 +1501,9 @@ static inline int bcf_float_is_vector_end(float f) return u.i==bcf_float_vector_end ? 1 : 0; } -typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version; -/** - * bcf_get_version - get the version as bcf_version enumeration - * @param hdr - bcf header, to get version - * @param ipver - pointer to return version - * Returns 0 on success and -1 on failure - */ -static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver) -{ - const char *version = NULL; - - if (!hdr || !ver) { - return -1; - } - - version = bcf_hdr_get_version(hdr); - if (!strcmp("VCFv4.1", version)) { - *ver = v41; - } else if (!strcmp("VCFv4.2", version)) { - *ver = v42; - } else if (!strcmp("VCFv4.3", version)) { - *ver = v43; - } else { - *ver = v44; - } - return 0; -} - -static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) -{ - uint32_t e = 0; - #define BRANCH(type_t, convert, missing, vector_end) { \ - uint8_t *ptr = fmt->p + isample*fmt->size; \ - int i; \ - for (i=0; in; i++, ptr += sizeof(type_t)) \ - { \ - type_t val = convert(ptr); \ - if ( val == vector_end ) break; \ - if ( i ) e |= kputc("/|"[val&1], str) < 0; \ - if ( !(val>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((val>>1) - 1, str) < 0; \ - } \ - if (i == 0) e |= kputc('.', str) < 0; \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; - case BCF_BT_NULL: e |= kputc('.', str) < 0; break; - default: hts_log_error("Unexpected type %d", fmt->type); return -2; - } - #undef BRANCH - return e == 0 ? 0 : -1; -} /** - * bcf_format_gt1 - formats GT information on a string + * bcf_format_gt_v2 - formats GT information on a string * @param hdr - bcf header, to get version * @param fmt - pointer to bcf format data * @param isample - position of interested sample in data @@ -1569,73 +1515,11 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) * Explicit / prefixed phasing for 1st allele is used only when it is a must to * correctly express phasing. */ -static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) +HTSLIB_EXPORT int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str); + +static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { - uint32_t e = 0; - bcf_version ver = v42; - int ploidy = 1, anyunphased = 0; - int32_t val0 = 0; - kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE; - - if (bcf_get_version(hdr, &ver)) { - hts_log_error("Failed to get version information"); - return -1; - } - #define BRANCH(type_t, convert, missing, vector_end) { \ - uint8_t *ptr = fmt->p + isample*fmt->size; \ - int i; \ - for (i=0; in; i++, ptr += sizeof(type_t)) \ - { \ - type_t val = convert(ptr); \ - if ( val == vector_end ) break; \ - if (!i) { val0 = val; } \ - if (i) { \ - e |= kputc("/|"[val & 1], &tmp1) < 0; \ - anyunphased |= !(val & 1); \ - } \ - if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \ - else e |= kputw((val >> 1) - 1, &tmp1) < 0; \ - } \ - if (i == 0) e |= kputc('.', &tmp1) < 0; \ - ploidy = i; \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; - case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break; - default: hts_log_error("Unexpected type %d", fmt->type); return -2; - } - #undef BRANCH - - if (ver >= v44) { //output which supports prefixed phasing - /* update 1st allele's phasing if required and append rest to it. - use prefixed phasing only when it is a must. i.e. without which the - inferred value will be incorrect */ - if (val0 & 1) { - /* 1st one is phased, if ploidy is > 1 and an unphased allele exists - need to specify explicitly */ - e |= (ploidy > 1 && anyunphased) ? - (kputc('|', &tmp2) < 0) : - (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p - (kputc('|', &tmp2) < 0) : - 0); - } else { - /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or - ploidy > 1 and no other unphased allele exist, need to specify - explicitly */ - e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ? - (kputc('/', &tmp2) < 0) : - 0; - } - e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one - ks_free(&tmp1); - tmp1 = tmp2; - } - //updated v44 string or @@ -116,6 +116,7 @@ typedef struct vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings + int version; //cached version } bcf_hdr_aux_t; @@ -124,6 +125,69 @@ static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) return (bcf_hdr_aux_t *)hdr->dict[0]; } +//version macros +#define VCF_DEF 4002000 +#define VCF44 4004000 + +#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 ) +#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 ) + +/** + * bcf_get_version - get the version as int + * @param hdr - bcf header, to get version + * @param verstr- version string, which is already available + * Returns version on success and default version on failure + * version = major * 100 * 10000 + minor * 1000 + */ +static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) +{ + const char *version = NULL, vcf[] = "VCFv"; + char *major = NULL, *minor = NULL; + int ver = -1; + long tmp = 0; + bcf_hdr_aux_t *aux = NULL; + + if (!hdr && !verstr) { //invalid input + goto fail; + } + + if (hdr) { + if ((aux = get_hdr_aux(hdr)) && aux->version != 0) { //use cached version + return aux->version; + } + //get from header + version = bcf_hdr_get_version(hdr); + } else { + //get from version string + version = verstr; + } + if (!(major = strstr(version, vcf))) { //bad format + goto fail; + } + major += sizeof(vcf) - 1; + if (!(minor = strchr(major, '.'))) { //bad format + goto fail; + } + tmp = strtol(major, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver = tmp * 100 * 10000; + tmp = strtol(++minor, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver += tmp * 1000; + return ver; + +fail: + hts_log_warning("Couldn't get VCF version, considering as %d.%d", + VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF)); + return VCF_DEF; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -985,7 +1049,6 @@ static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { - // currently only for bcf_hdr_set_version assert( hrec->type==BCF_HL_GEN ); int ret; khint_t k; @@ -1014,6 +1077,11 @@ int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) free(hrec->value); hrec->value = strdup(tmp->value); if ( !hrec->value ) return -1; + + if (!strcmp(hrec->key,"fileformat")) { + //update version + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); + } return 0; } @@ -1037,7 +1105,6 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) bcf_hrec_destroy(hrec); return 0; } - // Is one of the generic fields and already present? if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) { @@ -1052,6 +1119,9 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) free(str.s); return 0; } + if (!strcmp(hrec->key, "fileformat")) { + aux->version = bcf_get_version(NULL, hrec->value); + } } int i; @@ -1387,6 +1457,8 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); + + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); } else { @@ -1420,6 +1492,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); + aux->version = 0; free(h->dict[0]); h->dict[0] = aux; @@ -1428,6 +1501,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_append(h, "##fileformat=VCFv4.2"); // The filter PASS must appear first in the dictionary bcf_hdr_append(h, "##FILTER="); + aux->version = VCF_DEF; } return h; @@ -3061,13 +3135,9 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *t = q + 1; int m = 0; // m: sample id const int nsamples = bcf_hdr_nsamples(h); - bcf_version ver = v42; const char *end = s->s + s->l; - if (bcf_get_version(h, &ver)) { - hts_log_error("Failed to get version information"); - return -1; - } + int ver = bcf_get_version(h, NULL); while ( t= v44 && (*t == '|' || *t == '/')) { + if (ver >= VCF44 && (*t == '|' || *t == '/')) { // cache prefix and phasing status is_phased = *t++ == '|'; phasingprfx = 1; @@ -3150,7 +3220,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } - if (ver >= v44 && !phasingprfx) { + if (ver >= VCF44 && !phasingprfx) { /* no explicit phasing for 1st allele, set based on other alleles and ploidy */ if (ploidy == 1) { //implicitly phased @@ -4220,7 +4290,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!first) kputc_(':', s); first = 0; if (gt_i == i) { - bcf_format_gt1(h, f,j,s); + bcf_format_gt_v2(h, f,j,s); break; } else if (f->n == 1) @@ -5983,3 +6053,84 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +/** + * bcf_format_gt_v2 - formats GT information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is extended from bcf_format_gt to output phasing information + * in accordance with v4.4 format, which supports explicit / prefixed phasing + * for 1st allele. + * Explicit / prefixed phasing for 1st allele is used only when it is a must to + * correctly express phasing. + */ +int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) +{ + uint32_t e = 0; + int ploidy = 1, anyunphased = 0; + int32_t val0 = 0; + kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE; + + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ + int i; \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ + { \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if (!i) { val0 = val; } \ + if (i) { \ + e |= kputc("/|"[val & 1], &tmp1) < 0; \ + anyunphased |= !(val & 1); \ + } \ + if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \ + else e |= kputw((val >> 1) - 1, &tmp1) < 0; \ + } \ + if (i == 0) e |= kputc('.', &tmp1) < 0; \ + ploidy = i; \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, + bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, + bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, + bcf_int32_vector_end); break; + case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break; + default: hts_log_error("Unexpected type %d", fmt->type); return -2; + } + #undef BRANCH + + if (get_hdr_aux(hdr)->version >= VCF44) { + //output which supports prefixed phasing + + /* update 1st allele's phasing if required and append rest to it. + use prefixed phasing only when it is a must. i.e. without which the + inferred value will be incorrect */ + if (val0 & 1) { + /* 1st one is phased, if ploidy is > 1 and an unphased allele exists + need to specify explicitly */ + e |= (ploidy > 1 && anyunphased) ? + (kputc('|', &tmp2) < 0) : + (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p + (kputc('|', &tmp2) < 0) : + 0); + } else { + /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or + ploidy > 1 and no other unphased allele exist, need to specify + explicitly */ + e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ? + (kputc('/', &tmp2) < 0) : + 0; + } + e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one + ks_free(&tmp1); + tmp1 = tmp2; + } + //updated v44 string or