Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanZhangUCSC committed Dec 1, 2023
1 parent 66ead2c commit 8a631a9
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 17 deletions.
25 changes: 16 additions & 9 deletions src/PangenomeMATV2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <tbb/parallel_for_each.h>
#include <tbb/concurrent_vector.h>
#include <tbb/concurrent_unordered_map.h>
#include <boost/algorithm/string.hpp>
#include <ctime>
#include <iomanip>
#include <mutex>
Expand Down Expand Up @@ -3526,6 +3527,11 @@ void PangenomeMAT2::Tree::fillMutationMats(statsgenotype::mutationMatrices& mutm
}

void printMatrices(const statsgenotype::mutationMatrices& mutmat) {
for (const auto& count : mutmat.total_submuts) {
cout << count << endl;
}
cout << endl;

for (const auto& row : mutmat.submat) {
for (const auto& count : row) {
cout << count << " ";
Expand All @@ -3534,15 +3540,14 @@ void printMatrices(const statsgenotype::mutationMatrices& mutmat) {
}
cout << endl;

// for (const auto& count : mutmat.total_submuts) {
// cout << count << " ";
// }
// cout << endl;

cout << mutmat.total_delmut << endl;
for (const auto& count : mutmat.delmat) {
cout << count << " ";
}
cout << endl;

cout << mutmat.total_insmut << endl;
for (const auto& count : mutmat.insmat) {
cout << count << " ";
}
Expand Down Expand Up @@ -3589,7 +3594,9 @@ int parse_readbases(
indel_size = stoul(matches.str(1));
indel_size_len = matches.length(1);
indel_size_idx = matches.position(1);
insertion_seqs.push_back(readbase_substr.substr(indel_size_idx + indel_size_len, indel_size));
string seq = readbase_substr.substr(indel_size_idx + indel_size_len, indel_size);
boost::to_upper(seq);
insertion_seqs.push_back(seq);
ins_errs += readbase_errors[cur_idx];
cur_start += (indel_size_idx + indel_size_len + indel_size);
cur_idx++;
Expand Down Expand Up @@ -3650,15 +3657,15 @@ void PangenomeMAT2::Tree::printVCFGenotypeStats(std::ifstream& fin) {
auto variation_types = parse_readbases(readbases_string, readbases_errors, ref_nuc, nucs, insertion_seqs, deletion_sizes, errs);
candidate_variants.emplace_back(statsgenotype::variationSite(
site_id, ref_nuc, position, variation_types, nucs,
insertion_seqs, deletion_sizes, errs
insertion_seqs, deletion_sizes, errs, mutmat
));
site_id++;
}
}

// for (const auto& site : candidate_variants) {
// statsgenotype::printSiteGenotypeLikelihoods(site);
// }
for (const auto& site : candidate_variants) {
statsgenotype::printSiteGenotypePosteriors(site);
}


}
Expand Down
86 changes: 78 additions & 8 deletions src/statsgenotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ static double likelihood(
genotype_probs.push_back(phred_complement(prob));
}
} else {

variants_probs.insert(variants_probs.end(), row.begin(), row.end());

}
}

Expand Down Expand Up @@ -144,15 +142,15 @@ static vector<double> genotype_likelihoods(
) {
vector<double> likelihoods;
likelihoods.resize(5);
for (auto& prob : likelihoods) {
prob = numeric_limits<double>::max();
for (auto i = 0; i < 5; ++i) {
likelihoods[i] = numeric_limits<double>::max();
}

auto variation_types = site_info & 7;

auto ref_nuc = site_info >> 3;

for (size_t i = 0; i < read_errs.size(); ++i) {
if (read_errs[i].empty()) { continue; }
if (read_errs[i].empty() && i != ref_nuc) { continue; }
likelihoods[i] = likelihood(i, read_errs, deletions, insertions, variationType::SNP);
}

Expand All @@ -171,10 +169,47 @@ static vector<double> genotype_likelihoods(
return likelihoods;
}

static vector<double> genotype_posteriors(
const vector<double>& likelihoods, const map<size_t, vector<double> >& deletions,
const map<string, vector<double> >& insertions, const int8_t& site_info, const mutationMatrices& mutmat
) {
vector<double> posteriors;
posteriors.resize(4);
for (auto i = 0; i < 4; i++) {
posteriors[i] = numeric_limits<double>::max();
}

auto ref_nuc = site_info >> 3;
for (auto i = 0; i < 4; ++i) {
if (likelihoods[i] != numeric_limits<double>::max()) {
posteriors[i] = likelihoods[i] + mutmat.submat[ref_nuc][i];
}
}

size_t insertion_idx = 0;
for (const auto& insertion : insertions) {
posteriors.push_back(likelihoods[5 + insertion_idx] + mutmat.insmat[insertion.first.size()]);
insertion_idx++;
}

size_t deletion_idx = 0;
for (const auto& deletion : deletions) {
posteriors.push_back(likelihoods[5 + insertion_idx + deletion_idx] + mutmat.insmat[deletion.first]);
insertion_idx++;
}

double min_score = *min_element(posteriors.begin(), posteriors.end());
for (int i = 0; i < posteriors.size(); ++i) {
posteriors[i] -= min_score;
}
return posteriors;
}

struct variationSite {
variationSite(
size_t sid, char ref, size_t position, int variation_types, const string& nucs,
const vector<string>& insertion_seqs, const vector<size_t>& deletion_lens, const string& errors
const vector<string>& insertion_seqs, const vector<size_t>& deletion_lens, const string& errors,
const mutationMatrices& mutmat
) {
site_id = sid;
ref_position = position;
Expand Down Expand Up @@ -204,6 +239,7 @@ struct variationSite {
}

likelihoods = genotype_likelihoods(read_errs, deletions, insertions, site_info);
posteriors = genotype_posteriors(likelihoods, deletions, insertions, site_info, mutmat);
}

size_t site_id;
Expand All @@ -222,7 +258,7 @@ struct variationSite {
map<string, vector<double> > insertions;

vector<double> likelihoods;

vector<double> posteriors;
};

static void printSiteGenotypeLikelihoods(const variationSite& site) {
Expand All @@ -247,6 +283,40 @@ static void printSiteGenotypeLikelihoods(const variationSite& site) {
cout << endl;
}

static void printSiteGenotypePosteriors(const variationSite& site) {
auto ref_nuc_idx = site.site_info >> 3;

bool no_print = true;
for (int i = 0; i < site.posteriors.size(); i++) {
if (i != ref_nuc_idx && site.posteriors[i] != numeric_limits<double>::max()) {
no_print = false;
}
}
if (no_print) {
return;
}

cout << site.ref_position << '\t' << getNucleotideFromIndex(ref_nuc_idx) << "\t";
for (int i = 0; i < 4; i++) {
if (site.posteriors[i] != numeric_limits<double>::max()) {
cout << getNucleotideFromIndex(i) << ":" << site.posteriors[i] << "\t";
}
}

size_t insertion_idx = 0;
for (const auto& insertion : site.insertions) {
cout << "+" << insertion.first.size() << insertion.first << ":" << site.posteriors[4 + insertion_idx] << "\t";
insertion_idx++;
}

size_t deletion_idx = 0;
for (const auto& deletion : site.deletions) {
cout << "-" << deletion.first << ":" << site.posteriors[4 + site.insertions.size() + deletion_idx] << "\t";
deletion_idx++;
}
cout << endl;
}

}


Expand Down

0 comments on commit 8a631a9

Please sign in to comment.