diff --git a/src/PangenomeMATV2.cpp b/src/PangenomeMATV2.cpp index 9111fdb9..400827db 100644 --- a/src/PangenomeMATV2.cpp +++ b/src/PangenomeMATV2.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -3526,6 +3527,11 @@ void PangenomeMAT2::Tree::fillMutationMats(statsgenotype::mutationMatrices& mutm } void printMatrices(const statsgenotype::mutationMatrices& mutmat) { + for (const auto& count : mutmat.total_submuts) { + cout << count << endl; + } + cout << endl; + for (const auto& row : mutmat.submat) { for (const auto& count : row) { cout << count << " "; @@ -3534,15 +3540,14 @@ void printMatrices(const statsgenotype::mutationMatrices& mutmat) { } cout << endl; - // for (const auto& count : mutmat.total_submuts) { - // cout << count << " "; - // } - // cout << endl; + cout << mutmat.total_delmut << endl; for (const auto& count : mutmat.delmat) { cout << count << " "; } cout << endl; + + cout << mutmat.total_insmut << endl; for (const auto& count : mutmat.insmat) { cout << count << " "; } @@ -3589,7 +3594,9 @@ int parse_readbases( indel_size = stoul(matches.str(1)); indel_size_len = matches.length(1); indel_size_idx = matches.position(1); - insertion_seqs.push_back(readbase_substr.substr(indel_size_idx + indel_size_len, indel_size)); + string seq = readbase_substr.substr(indel_size_idx + indel_size_len, indel_size); + boost::to_upper(seq); + insertion_seqs.push_back(seq); ins_errs += readbase_errors[cur_idx]; cur_start += (indel_size_idx + indel_size_len + indel_size); cur_idx++; @@ -3650,15 +3657,15 @@ void PangenomeMAT2::Tree::printVCFGenotypeStats(std::ifstream& fin) { auto variation_types = parse_readbases(readbases_string, readbases_errors, ref_nuc, nucs, insertion_seqs, deletion_sizes, errs); candidate_variants.emplace_back(statsgenotype::variationSite( site_id, ref_nuc, position, variation_types, nucs, - insertion_seqs, deletion_sizes, errs + insertion_seqs, deletion_sizes, errs, mutmat )); site_id++; } } - // for (const auto& site : candidate_variants) { - // statsgenotype::printSiteGenotypeLikelihoods(site); - // } + for (const auto& site : candidate_variants) { + statsgenotype::printSiteGenotypePosteriors(site); + } } diff --git a/src/statsgenotype.hpp b/src/statsgenotype.hpp index c1479b8c..f5e12a20 100644 --- a/src/statsgenotype.hpp +++ b/src/statsgenotype.hpp @@ -100,9 +100,7 @@ static double likelihood( genotype_probs.push_back(phred_complement(prob)); } } else { - variants_probs.insert(variants_probs.end(), row.begin(), row.end()); - } } @@ -144,15 +142,15 @@ static vector genotype_likelihoods( ) { vector likelihoods; likelihoods.resize(5); - for (auto& prob : likelihoods) { - prob = numeric_limits::max(); + for (auto i = 0; i < 5; ++i) { + likelihoods[i] = numeric_limits::max(); } auto variation_types = site_info & 7; - + auto ref_nuc = site_info >> 3; for (size_t i = 0; i < read_errs.size(); ++i) { - if (read_errs[i].empty()) { continue; } + if (read_errs[i].empty() && i != ref_nuc) { continue; } likelihoods[i] = likelihood(i, read_errs, deletions, insertions, variationType::SNP); } @@ -171,10 +169,47 @@ static vector genotype_likelihoods( return likelihoods; } +static vector genotype_posteriors( + const vector& likelihoods, const map >& deletions, + const map >& insertions, const int8_t& site_info, const mutationMatrices& mutmat +) { + vector posteriors; + posteriors.resize(4); + for (auto i = 0; i < 4; i++) { + posteriors[i] = numeric_limits::max(); + } + + auto ref_nuc = site_info >> 3; + for (auto i = 0; i < 4; ++i) { + if (likelihoods[i] != numeric_limits::max()) { + posteriors[i] = likelihoods[i] + mutmat.submat[ref_nuc][i]; + } + } + + size_t insertion_idx = 0; + for (const auto& insertion : insertions) { + posteriors.push_back(likelihoods[5 + insertion_idx] + mutmat.insmat[insertion.first.size()]); + insertion_idx++; + } + + size_t deletion_idx = 0; + for (const auto& deletion : deletions) { + posteriors.push_back(likelihoods[5 + insertion_idx + deletion_idx] + mutmat.insmat[deletion.first]); + insertion_idx++; + } + + double min_score = *min_element(posteriors.begin(), posteriors.end()); + for (int i = 0; i < posteriors.size(); ++i) { + posteriors[i] -= min_score; + } + return posteriors; +} + struct variationSite { variationSite( size_t sid, char ref, size_t position, int variation_types, const string& nucs, - const vector& insertion_seqs, const vector& deletion_lens, const string& errors + const vector& insertion_seqs, const vector& deletion_lens, const string& errors, + const mutationMatrices& mutmat ) { site_id = sid; ref_position = position; @@ -204,6 +239,7 @@ struct variationSite { } likelihoods = genotype_likelihoods(read_errs, deletions, insertions, site_info); + posteriors = genotype_posteriors(likelihoods, deletions, insertions, site_info, mutmat); } size_t site_id; @@ -222,7 +258,7 @@ struct variationSite { map > insertions; vector likelihoods; - + vector posteriors; }; static void printSiteGenotypeLikelihoods(const variationSite& site) { @@ -247,6 +283,40 @@ static void printSiteGenotypeLikelihoods(const variationSite& site) { cout << endl; } +static void printSiteGenotypePosteriors(const variationSite& site) { + auto ref_nuc_idx = site.site_info >> 3; + + bool no_print = true; + for (int i = 0; i < site.posteriors.size(); i++) { + if (i != ref_nuc_idx && site.posteriors[i] != numeric_limits::max()) { + no_print = false; + } + } + if (no_print) { + return; + } + + cout << site.ref_position << '\t' << getNucleotideFromIndex(ref_nuc_idx) << "\t"; + for (int i = 0; i < 4; i++) { + if (site.posteriors[i] != numeric_limits::max()) { + cout << getNucleotideFromIndex(i) << ":" << site.posteriors[i] << "\t"; + } + } + + size_t insertion_idx = 0; + for (const auto& insertion : site.insertions) { + cout << "+" << insertion.first.size() << insertion.first << ":" << site.posteriors[4 + insertion_idx] << "\t"; + insertion_idx++; + } + + size_t deletion_idx = 0; + for (const auto& deletion : site.deletions) { + cout << "-" << deletion.first << ":" << site.posteriors[4 + site.insertions.size() + deletion_idx] << "\t"; + deletion_idx++; + } + cout << endl; +} + }