diff --git a/mutation_matrices/sars_4k.mutmat b/mutation_matrices/sars_4k.mutmat new file mode 100644 index 00000000..4c96ea2d --- /dev/null +++ b/mutation_matrices/sars_4k.mutmat @@ -0,0 +1,6 @@ +0.000359671 46.6273 44.5609 45.8359 +44.8418 0.000986932 48.5409 37.4368 +43.3582 47.9474 0.000479415 43.1704 +46.9954 42.4279 47.9589 0.00040456 +0.000314554 46.3982 52.0032 50.0921 57.1724 56.8862 45.3096 +0.000151323 49.717 52.9112 54.0499 58.4771 59.8264 48.9764 diff --git a/src/statsgenotype.hpp b/src/statsgenotype.hpp index 0e8aed4f..b99c8de6 100644 --- a/src/statsgenotype.hpp +++ b/src/statsgenotype.hpp @@ -335,15 +335,21 @@ static void printSiteGenotypePosteriors(const variationSite& site) { } static void printVCFLine(const statsgenotype::variationSite& site) { + size_t position = site.ref_position + 1; + int ref_nuc_idx = site.site_info >> 3; + size_t readDepth = 0; + vector altAlleles; + vector ad; + vector pl; string refAllele; - vector alleles; int gt; - vector dp; - vector pl; - size_t position; - refAllele += getNucleotideFromIndex(site.site_info >> 3); - position = site.ref_position + 1; + int quality = site.likelihoods[ref_nuc_idx]; + for (const auto& depth : site.read_depth) { + readDepth += depth; + } + + refAllele += getNucleotideFromIndex(ref_nuc_idx); // find longest deletion size_t ldl = 0; // longest deletion length @@ -356,61 +362,82 @@ static void printVCFLine(const statsgenotype::variationSite& site) { } if (!lds.empty()) { - defAllele += lds; + refAllele += lds; } - - // string refAllele; - // string altAllele; - // size_t position; - - // refAllele += getNucleotideFromIndex(site.site_info >> 3); - // position = site.ref_position + 1; - - // bool no_print = true; - // for (int i = 0; i < site.posteriors.size(); i++) { - // if (i != (site.site_info >> 3) && site.posteriors[i] != numeric_limits::max()) { - // no_print = false; - // } - // } - // if (no_print || site.most_probable_idx == (site.site_info >> 3)) { - // return; - // } - - // if (site.most_probable_idx <= 3) { - // altAllele += getNucleotideFromIndex(site.most_probable_idx); - // } else { - // if (site.most_probable_idx <= 3 + site.insertions.size()) { - // int idx = 0; - // for (const auto& ins : site.insertions) { - // if (idx == site.most_probable_idx - 4) { - // altAllele += (refAllele + ins.first); - // } - // idx += 1; - // } - // } else { - // int idx = 0; - // for (const auto& del : site.deletions) { - // if (idx == site.most_probable_idx - site.insertions.size() - 4) { - // altAllele = refAllele; - // refAllele += del.first; - // } - // idx += 1; - // } - // } - // } - - - - cout << "ref" << "\t" // #CHROM - << position << "\t" // POS - << "." << "\t" // ID - << refAllele << "\t" // REF - << altAllele << "\t" // ALT - << "." << "\t" // QUAL - << "." << "\t" // FILTER - << "." << "\t" // INFO - << "GT:DP:PL" << "\t" // FORMAT - << "sample" << endl; // SAMPLE + + // depth and likelihood for reference + ad.push_back(site.read_depth[ref_nuc_idx]); + pl.push_back(site.posteriors[ref_nuc_idx]); + if (pl.back() == 0.0) { + gt = 0; + } + + // substitutions + for (int i = 0; i < 4; i++) { + if (i == ref_nuc_idx) { + continue; + } + if (site.posteriors[i] != numeric_limits::max()) { + altAlleles.push_back(getNucleotideFromIndex(i) + lds); + ad.push_back(site.read_depth[i]); + pl.push_back(site.posteriors[i]); + if (pl.back() == 0.0) { + gt = altAlleles.size(); + } + } + } + + // insertions + size_t indelIdx = 0; + for (const auto& ins : site.insertions) { + altAlleles.push_back(getNucleotideFromIndex(ref_nuc_idx) + ins.first + lds); + ad.push_back(site.read_depth[4 + indelIdx]); + pl.push_back(site.posteriors[4 + indelIdx]); + if (pl.back() == 0.0) { + gt = altAlleles.size(); + } + indelIdx += 1; + } + + // deletions + for (const auto& del : site.deletions) { + size_t delSize = del.first.size(); + if (delSize == ldl) { + altAlleles.push_back(refAllele.substr(0, 1)); + } else { + altAlleles.push_back(getNucleotideFromIndex(ref_nuc_idx) + lds.substr(delSize, ldl - delSize)); + } + ad.push_back(site.read_depth[4 + indelIdx]); + pl.push_back(site.posteriors[4 + indelIdx]); + if (pl.back() == 0.0) { + gt = altAlleles.size(); + } + indelIdx += 1; + } + + + + cout << "ref" << "\t" // #CHROM + << position << "\t" // POS + << "." << "\t" // ID + << refAllele << "\t"; // REF + for (size_t i = 0; i < altAlleles.size() - 1; i++) { + cout << altAlleles[i] << ","; + } + cout << altAlleles[altAlleles.size() - 1] << "\t"; // ALT + cout << quality << "\t" // QUAL + << "." << "\t" // FILTER + << "DP=" << readDepth << "\t" // INFO + << "GT:AD:PL" << "\t" // FORMAT + << gt << ":"; // SAMPLE + for (size_t i = 0; i < ad.size() - 1; i++) { + cout << ad[i] << ","; + } + cout << ad[ad.size() - 1] << ":"; + for (size_t i = 0; i < pl.size() - 1; i++) { + cout << pl[i] << ","; + } + cout << pl[pl.size() - 1] << "\t\t" << site.tmp_readbase_string << endl; } } diff --git a/src/test/placement.test.cpp b/src/test/placement.test.cpp index 69110ef5..e81dd5e3 100644 --- a/src/test/placement.test.cpp +++ b/src/test/placement.test.cpp @@ -1,4 +1,4 @@ -#define BOOST_TEST_MODULE Index construction + #define BOOST_TEST_MODULE Index construction #include #include "../PangenomeMAT.hpp" #include @@ -516,6 +516,8 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) { std::string best_match; PangenomeMAT::placeSample(pT, pruned_sample_fastq_1, index, k, s, best_match); + cout << pruned_sample << "\t" << T->allNodes[pruned_sample]->parent->identifier << "\t" << best_match << endl; + const std::string ref_node_seq = T->getStringFromReference(best_match, false); std::string reference_node_fa_path = "../src/test/statsgenotype/fasta/" + best_match + ".fa"; if(!filesystem::exists(reference_node_fa_path)) { @@ -529,16 +531,20 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) { } std::string sam_path = "../src/test/statsgenotype/sam/" + pruned_sample + ".sam"; + std::string minimap_cmd_str = "minimap2 -ax sr " + reference_node_fa_path + " " + pruned_sample_fastq_1 + " " + pruned_sample_fastq_2 + " --heap-sort=yes | samtools sort -O sam > " + sam_path; + cout << minimap_cmd_str << endl; if (!filesystem::exists(sam_path)) { - std::string minimap_cmd_str = "minimap2 -ax sr " + reference_node_fa_path + " " + pruned_sample_fastq_1 + " " + pruned_sample_fastq_2 + " --heap-sort=yes | samtools sort -O sam > " + sam_path; const char* minimap_cmd = minimap_cmd_str.c_str(); + cout << minimap_cmd_str << endl; system(minimap_cmd); } std::string plu_path = "../src/test/statsgenotype/pileup/" + pruned_sample + ".pileup"; + std::string pileup_cmd_str = "samtools mpileup " + sam_path + " -f " + reference_node_fa_path + " -A > " + plu_path; + cout << pileup_cmd_str << endl; if (!filesystem::exists(plu_path)) { - std::string pileup_cmd_str = "samtools mpileup " + sam_path + " -f " + reference_node_fa_path + " > " + plu_path; const char* pileup_cmd = pileup_cmd_str.c_str(); + cout << pileup_cmd_str << endl; system(pileup_cmd); } @@ -558,10 +564,27 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) { // << best_match << "\t" // << get_distance(T, T->allNodes[best_match], T->allNodes[pruned_sample]->parent) << "\t" // << get_distance_branch(T, T->allNodes[best_match], T->allNodes[pruned_sample]->parent) << endl; - break; - } + std::ifstream testplu("../src/test/statsgenotype/pileup/test/test.pileup"); + std::ifstream min("../mutation_matrices/sars_4k.mutmat"); + T->printSamplePlacementVCF(testplu, &min); + + // std::ifstream ecis("../ecoli_100.pmat"); + // boost::iostreams::filtering_streambuf< boost::iostreams::input> ecinPMATBuffer; + // ecinPMATBuffer.push(boost::iostreams::gzip_decompressor()); + // ecinPMATBuffer.push(ecis); + // std::istream ecinputStream(&ecinPMATBuffer); + // Tree *ecT = new Tree(ecinputStream); + + // cout << "NZ_CP007594.1 -> node_63" << endl; + // Node* sampleNode = ecT->allNodes["NZ_CP007594.1"]; + // cout << sampleNode->parent->identifier << endl; + // cout << sampleNode->nucMutation.size() << endl; + // cout << sampleNode->blockMutation.size() << endl; + // for (const auto& mut : sampleNode->nucMutation) { + // cout << mut.nucPosition << "\t" << (mut.mutInfo >> 4) << "\t" << (mut.mutInfo & 15) << endl; + // } is.close(); } diff --git a/src/test/statsgenotype/pileup/test/test.pileup b/src/test/statsgenotype/pileup/test/test.pileup new file mode 100644 index 00000000..3723bef0 --- /dev/null +++ b/src/test/statsgenotype/pileup/test/test.pileup @@ -0,0 +1,6 @@ +test_ref 21 A 5 CccCC FFFFF +test_ref 96 G 3 .C, FFF +test_ref 100 A 5 ,GTC. FFFFF +test_ref 169 T 4 .-2TT.-2TT..+1G FFFF +test_ref 250 C 5 .-2CG,-2cg.-2CG.-2CG, FFFFF +test_ref 420 G 5 .-2TT.-2TT.-1T.,-2TT FFFFF