Skip to content

Commit

Permalink
update for P, extended string split, PSL, PSO, PSQ
Browse files Browse the repository at this point in the history
  • Loading branch information
vasudeva8 committed Aug 14, 2024
1 parent b0b623d commit 8125b09
Show file tree
Hide file tree
Showing 11 changed files with 616 additions and 71 deletions.
34 changes: 34 additions & 0 deletions inc/util/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,40 @@ namespace ebi
output.swap(ret);
}

/**
 * Extended version of string_split.
 *
 * Splits `s` at every character contained in `delims` and fills the container `ret` with the parts.
 * Whether the delimiter is kept is controlled by `withdelim`:
 *  - false: delimiters are dropped. A leading delimiter produces an empty first part
 *    ("/1" -> "", "1"); a trailing delimiter does NOT produce an empty last part ("1/" -> "1").
 *  - true: every delimiter is kept as the first character of the part that follows it
 *    ("1/2|3" -> "1", "/2", "|3"). A delimiter at position 0 stays attached to the first part
 *    ("/1" -> "/1"), so no part is ever empty.
 * An empty string results in an empty container `ret`.
 * The whole of `s` is scanned, including any embedded NUL characters.
 *
 * @param s input string to split
 * @param delims NUL-terminated list of characters, each of which acts as a separator; must not be null
 * @param ret return by reference the container filled with the parts; previous contents are discarded.
 *            Its value_type must be constructible from a pair of `char const *` iterators.
 * @param withdelim true to retain the delimiters, false to split without them
 */
template<typename C>
void string_split_ex(std::string const & s, char const * delims, C & ret, bool withdelim)
{
    C output;

    if (!s.empty()) {
        // When delimiters are retained a part may START on a delimiter, so the search for the
        // next separator has to begin one character after the start of the current part.
        std::string::size_type const skip = withdelim ? 1 : 0;
        char const * const base = s.data();

        std::string::size_type start = 0;
        std::string::size_type pos = s.find_first_of(delims, start + skip);

        // Insert first to last-1 elements
        while (pos != std::string::npos) {
            output.push_back(typename C::value_type(base + start, base + pos));
            // Keep the delimiter as the head of the next part, or step over it
            start = withdelim ? pos : pos + 1;
            pos = s.find_first_of(delims, start + skip);
        }

        // Insert last element (nothing is left when the string ended on a dropped delimiter)
        if (start < s.size()) {
            output.push_back(typename C::value_type(base + start, base + s.size()));
        }
    }

    output.swap(ret);
}
/**
* Temporary implementation of mismatch taking 2 starts and 2 ends. It is not in the STL for C++11, but it will be in C++14.
*
Expand Down
12 changes: 11 additions & 1 deletion inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ namespace ebi
* (e.g. with 1 reference, 2 alternate alleles (3 total alleles) and ploidy 2, it's 3 + 2 -1 choose 2, which is 6: 00, 01, 11, 02, 12, 22)
* - "." means unknown number of elements
* - number is a positive number [0, +inf)
* - "P" is the allele in GT - TODO assumes tobe same as ploidy
* - "P" means one value for each allele in GT
* @param alternate_allele_number the number of alternate alleles
* @param ploidy is the number of copies of a chromosome in a sample, so a given genotype in said chromosome needs `ploidy` alleles to be completely specified
* @param expected_cardinality return by reference [0, +inf) for valid numbers. -1 if unknown number.
Expand Down Expand Up @@ -750,6 +750,16 @@ namespace ebi
* Checks specific SV alleles have same SVLEN when format CN is present
*/
void check_format_allele_SVLEN() const;

/**
 * Gets alleles from GT with phasing information.
 * The GT value is split on '|' and '/', and every allele keeps the phasing indicator that
 * precedes it. If the first allele has no leading indicator, one is prepended: '|' when the
 * GT contains a '|' anywhere, '/' otherwise.
 * @param GT genotype value of one sample; an empty string leaves `alleles` untouched
 * @param alleles return by reference the alleles, each prefixed with its phasing indicator
 */
void get_phased_alleles(std::string GT, std::vector<std::string>& alleles) const;

/**
 * Gets PSL values from samples.
 * Locates PSL among the FORMAT keys, splits sample `i` on ':' and then splits the subfield
 * found at the PSL position on ','.
 * @param i index of the sample to read
 * @param pslvalues return by reference the PSL values of that sample; left untouched when
 *                  PSL is not present in FORMAT
 */
void get_PSL_values(size_t i, std::vector<std::string>& pslvalues) const;
};

std::ostream &operator<<(std::ostream &os, const Record &record);
Expand Down
2 changes: 1 addition & 1 deletion inc/vcf/meta_entry_visitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace ebi
void check_filter(std::map<std::string, std::string> & value) const;
void check_filter_id(std::string const & id_field) const;
void check_format(std::map<std::string, std::string> & value) const;
void check_format_or_info_number(std::string const & number_field, std::string const & field) const;
void check_format_or_info_number(std::string const & number_field, std::string const & field, bool isinfo) const;
void check_format_type(std::string const & type_field) const;
void check_info(std::map<std::string, std::string> & value) const;
void check_info_type(std::string const & type_field) const;
Expand Down
21 changes: 11 additions & 10 deletions inc/vcf/validator_detail_v44.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*/


#line 266 "src/vcf/vcf_v44.ragel"
#line 267 "src/vcf/vcf_v44.ragel"


namespace
Expand Down Expand Up @@ -178,7 +178,7 @@ static const int vcf_v44_en_meta_section_skip = 623;
static const int vcf_v44_en_body_section_skip = 624;


#line 272 "src/vcf/vcf_v44.ragel"
#line 273 "src/vcf/vcf_v44.ragel"

}

Expand All @@ -197,7 +197,7 @@ namespace ebi
cs = vcf_v44_start;
}

#line 286 "src/vcf/vcf_v44.ragel"
#line 287 "src/vcf/vcf_v44.ragel"

}

Expand Down Expand Up @@ -1712,6 +1712,7 @@ case 148:
case 46: goto tr240;
case 65: goto tr240;
case 71: goto tr240;
case 80: goto tr240;
case 82: goto tr240;
}
if ( 48 <= (*p) && (*p) <= 57 )
Expand Down Expand Up @@ -7666,7 +7667,7 @@ case 637:
case 71:
#line 36 "src/vcf/vcf_v44.ragel"
{
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G or dot"});
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G, P or dot"});
p--; {cs = 623;goto _again;}
}
break;
Expand All @@ -7685,14 +7686,14 @@ case 637:
}
break;
case 75:
#line 264 "src/vcf/vcf_v44.ragel"
#line 265 "src/vcf/vcf_v44.ragel"
{ {cs = 28;goto _again;} }
break;
case 76:
#line 265 "src/vcf/vcf_v44.ragel"
#line 266 "src/vcf/vcf_v44.ragel"
{ {cs = 629;goto _again;} }
break;
#line 7696 "inc/vcf/validator_detail_v44.hpp"
#line 7697 "inc/vcf/validator_detail_v44.hpp"
}
}
goto _again;
Expand Down Expand Up @@ -8143,7 +8144,7 @@ goto _again;}
case 71:
#line 36 "src/vcf/vcf_v44.ragel"
{
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G or dot"});
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G, P or dot"});
p--; {cs = 623; if ( p == pe )
goto _test_eof;
goto _again;}
Expand Down Expand Up @@ -8176,15 +8177,15 @@ goto _again;}
goto _again;}
}
break;
#line 8180 "inc/vcf/validator_detail_v44.hpp"
#line 8181 "inc/vcf/validator_detail_v44.hpp"
}
}
}

_out: {}
}

#line 294 "src/vcf/vcf_v44.ragel"
#line 295 "src/vcf/vcf_v44.ragel"

}

Expand Down
13 changes: 8 additions & 5 deletions src/vcf/meta_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ namespace ebi
check_key_is_present(FORMAT, TYPE, value.count(TYPE));
check_key_is_present(FORMAT, DESCRIPTION, value.count(DESCRIPTION));

check_format_or_info_number(value[NUMBER], FORMAT);
check_format_or_info_number(value[NUMBER], FORMAT, false);
check_format_type(value[TYPE]);

if (entry.source->version == Version::v41 || entry.source->version == Version::v42) {
Expand All @@ -194,14 +194,17 @@ namespace ebi
}
}

void MetaEntryVisitor::check_format_or_info_number(std::string const & number_field, std::string const & field) const
void MetaEntryVisitor::check_format_or_info_number(std::string const & number_field, std::string const & field, bool isinfo) const
{
bool checkP = entry.source->version >= Version::v44;
if (util::contains_if(number_field, [](char c) { return !isdigit(c); }) &&
number_field != A &&
number_field != R &&
number_field != G &&
number_field != UNKNOWN_CARDINALITY) {
throw new MetaSectionError{entry.line, field + " metadata Number is not a number, A, R, G or dot"};
number_field != UNKNOWN_CARDINALITY &&
(isinfo || !checkP || (number_field != P && !isinfo && checkP))) {
throw new MetaSectionError{entry.line, field + " metadata Number is not a number, A, R, G" +
((!isinfo && checkP) ? ", P" : "") + " or dot"};
}
}

Expand All @@ -223,7 +226,7 @@ namespace ebi
check_key_is_present(INFO, TYPE, value.count(TYPE));
check_key_is_present(INFO, DESCRIPTION, value.count(DESCRIPTION));

check_format_or_info_number(value[NUMBER], INFO);
check_format_or_info_number(value[NUMBER], INFO, true);
check_info_type(value[TYPE]);

if (entry.source->version == Version::v41 || entry.source->version == Version::v42) {
Expand Down
83 changes: 82 additions & 1 deletion src/vcf/record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,48 @@ namespace ebi
}
}
}
if (source->version < Version::v44) { //not valid for < v44
return;
}
if (field_key == PSL) {
if (format[0] == GT) {
std::string::size_type pos = samples[i].find(':');
std::string GT_subfield = samples[i];
std::vector<std::string> alleles;
if (pos != std::string::npos) {
GT_subfield = samples[i].substr(0, pos);
}
get_phased_alleles(GT_subfield, alleles); //allele or unknown(.)
for (size_t i = 0; i < values.size(); ++i) {
if (alleles[i].at(0) == '/' && values[i] != MISSING_VALUE) {
//un-phased allele must have '.' in PSL
throw new SamplesFieldBodyError{line, message + " at " +
std::to_string(i+1) + " to be '.'", "", field_key};
}
}
}
} else if (field_key == PSO) {
//needs caching of records for detailed validation, which is not efficient - skipped
std::vector<std::string> pslvals;
get_PSL_values(i, pslvals);
for (int i = 0; i < pslvals.size(); ++i) {
if (pslvals[i] == MISSING_VALUE && values[i] != MISSING_VALUE) {
//when psl is missing val, pso has to be missing as well
throw new SamplesFieldBodyError{line, message + " at " + std::to_string(i+1) +
" to be '.' as corresponding PSL is missing", "", field_key};
}
}
} else if (field_key == PSQ) {
std::vector<std::string> pslvals;
get_PSL_values(i, pslvals);
for (int i = 0; i < pslvals.size(); ++i) {
if (pslvals[i] == MISSING_VALUE && values[i] != MISSING_VALUE) {
//when psl is missing val, psq has to be missing as well
throw new SamplesFieldBodyError{line, message + " at " + std::to_string(i+1) +
" to be '.' as corresponding PSL is missing", "", field_key};
}
}
}
}

void Record::check_sample_alleles(std::vector<std::string> const & subfields) const
Expand Down Expand Up @@ -942,7 +984,6 @@ namespace ebi
// ...it is unspecified
expected_cardinality = -1;
} else if (number == P && !isinfo) { //invalid for info data
// TODO: using ploidy, if it is unique alleles then need to parse GT field
expected_cardinality = ploidy;
} else {
// ...specified as a number in range [0, +MAX_LONG)
Expand Down Expand Up @@ -1162,6 +1203,46 @@ namespace ebi
}
}

void Record::get_PSL_values(size_t i, std::vector<std::string>& pslvalues) const
{
std::vector<std::string> samplevals;
const auto &psl = std::find(format.begin(), format.end(), PSL);
if (psl == format.end()) {
return; //PSL not found
}
size_t offset = psl - format.begin(); //position of PSL
//already field count checked data
util::string_split(samples[i], ":", samplevals);
util::string_split(samplevals[offset], ",", pslvalues);
}
/**
 * Gets alleles from GT with phasing information.
 *
 * The GT value is split on '|' and '/', keeping each delimiter as the first character of the
 * allele that follows it. If the first allele carries no leading phasing indicator, one is
 * prepended: '|' when the GT contains a '|' anywhere, '/' otherwise.
 *
 * @param GT genotype value of one sample; an empty string leaves `alleles` untouched
 * @param alleles return by reference the alleles, each prefixed with its phasing indicator
 */
void Record::get_phased_alleles(std::string GT, std::vector<std::string>& alleles) const
{
    if (GT.empty()) {
        return;
    }

    // NOTE(review): the VCF 4.4 wording for an omitted leading phasing indicator appears to be
    // phrased in terms of whether any '/' is present rather than any '|' - confirm that
    // mixed-phase and haploid genotypes are inferred as intended here.
    bool const anyphased = GT.find('|') != std::string::npos;

    std::vector<std::string> values;
    util::string_split_ex(GT, "|/", values, true);
    if (values.empty()) {
        return;
    }

    //check and assign phasing for 1st allele, if missing
    std::string & leading = values.front();
    if (leading.at(0) != '/' && leading.at(0) != '|') {
        //infer phasing based on other alleles phasing
        leading.insert(0, anyphased ? "|" : "/");
    }

    alleles.swap(values);
}

bool is_record_subfield_in_header(std::string const & field_value,
std::multimap<std::string, MetaEntry>::iterator begin,
std::multimap<std::string, MetaEntry>::iterator end)
Expand Down
20 changes: 1 addition & 19 deletions src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ namespace ebi

void ValidateOptionalPolicy::check_body_entry_info_svlen(ParsingState & state, Record const & record) const
{
//static boost::regex cnchk_regex("(<(CNV|DUP|DEL)(:[^>]+)*>)+");
static boost::regex svchk_regex("(<(INS|DUP|INV|DEL|CNV)(:[^>]+)*>)+");
static boost::regex non_symbolic_alt_regex("[ACGTN]+", boost::regex::icase);
std::string svlenval;
Expand All @@ -197,7 +196,6 @@ namespace ebi
if (record.source->version < Version::v44) {
return;
}
//auto itcn = std::find(record.format.begin(), record.format.end(), CN);
for (auto i = 0; i < record.alternate_alleles.size(); ++i) {
//SVLEN should be '.' for non SV alleles
if (boost::regex_match(record.alternate_alleles[i], non_symbolic_alt_regex) ||
Expand All @@ -207,22 +205,6 @@ namespace ebi
throw new InfoBodyError{state.n_lines, "INFO SVLEN should be " + MISSING_VALUE + " for alleles other than structural variant INS/INV/DUP/DEL/CNV"};
}
}
/*if (itcn != record.format.end()) {
//with CN in format, CNV/DEL/DUP should have the same SVLEN value, v4.4 onwards
if (record.types[i] != RecordType::STRUCTURAL || !boost::regex_match(record.alternate_alleles[i], cnchk_regex)) {
continue;
}
if (!svlenval.size()) {
svlenval = values[i]; //first
continue;
}
//CNV/DEL/DUP, should have the same SVLEN
if (svlenval != values[i]) {
throw new InfoBodyError{state.n_lines,
"INFO SVLEN should have same values for SV CNV/DEL/DUP", "Expected " + svlenval
+ ", found " + values[i]};
}
}*/
}
}
}
Expand Down Expand Up @@ -515,7 +497,7 @@ namespace ebi
if (values.size() % 2 != 0) { //CI should have even count
std::string message = "Sample #" + std::to_string(offset + 1) + ", field " + confidence_interval_tag +
" does not have even count";
throw new SamplesFieldBodyError{state.n_lines, message, "", confidence_interval_tag}; //TODO checl line is good or is it state.line
throw new SamplesFieldBodyError{state.n_lines, message, "", confidence_interval_tag};
}
for (int i = 0; i < values.size(); i += 2) {
size_t scanned_first_value_length = 1, scanned_second_value_length = 1;
Expand Down
9 changes: 5 additions & 4 deletions src/vcf/vcf_v44.ragel
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

# FORMAT metadata
action meta_format_number_err {
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G or dot"});
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G, P or dot"});
fhold; fgoto meta_section_skip;
}

Expand Down Expand Up @@ -88,7 +88,8 @@
meta_values = ('"' meta_field_desc '"') | meta_field_value ;
meta_field = meta_key '=' meta_values ;

meta_field_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' ) >token_begin @token_middle %token_end ;
meta_field_info_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' ) >token_begin @token_middle %token_end ;
meta_field_format_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' | 'P' ) >token_begin @token_middle %token_end ;
meta_field_type = (alpha)+ >token_begin @token_middle %token_end ;

meta_alt = 'ID=' %meta_id alt_id >token_begin @token_middle %token_end $err(meta_alt_id_err)
Expand All @@ -103,13 +104,13 @@
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;

meta_format = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_num $err(meta_format_number_err)
',Number=' %meta_number meta_field_format_num $err(meta_format_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;

meta_info = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_num $err(meta_info_number_err)
',Number=' %meta_number meta_field_info_num $err(meta_info_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
Expand Down
Loading

0 comments on commit 8125b09

Please sign in to comment.