Skip to content

Commit

Permalink
save
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanZhangUCSC committed Dec 13, 2023
1 parent 03b5677 commit ccd4761
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 65 deletions.
6 changes: 6 additions & 0 deletions mutation_matrices/sars_4k.mutmat
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
0.000359671 46.6273 44.5609 45.8359
44.8418 0.000986932 48.5409 37.4368
43.3582 47.9474 0.000479415 43.1704
46.9954 42.4279 47.9589 0.00040456
0.000314554 46.3982 52.0032 50.0921 57.1724 56.8862 45.3096
0.000151323 49.717 52.9112 54.0499 58.4771 59.8264 48.9764
147 changes: 87 additions & 60 deletions src/statsgenotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,15 +335,21 @@ static void printSiteGenotypePosteriors(const variationSite& site) {
}

static void printVCFLine(const statsgenotype::variationSite& site) {
size_t position = site.ref_position + 1;
int ref_nuc_idx = site.site_info >> 3;
size_t readDepth = 0;
vector<string> altAlleles;
vector<size_t> ad;
vector<int> pl;
string refAllele;
vector<string> alleles;
int gt;
vector<size_t> dp;
vector<int> pl;
size_t position;

refAllele += getNucleotideFromIndex(site.site_info >> 3);
position = site.ref_position + 1;
int quality = site.likelihoods[ref_nuc_idx];
for (const auto& depth : site.read_depth) {
readDepth += depth;
}

refAllele += getNucleotideFromIndex(ref_nuc_idx);

// find longest deletion
size_t ldl = 0; // longest deletion length
Expand All @@ -356,61 +362,82 @@ static void printVCFLine(const statsgenotype::variationSite& site) {
}

if (!lds.empty()) {
defAllele += lds;
refAllele += lds;
}

// string refAllele;
// string altAllele;
// size_t position;

// refAllele += getNucleotideFromIndex(site.site_info >> 3);
// position = site.ref_position + 1;

// bool no_print = true;
// for (int i = 0; i < site.posteriors.size(); i++) {
// if (i != (site.site_info >> 3) && site.posteriors[i] != numeric_limits<double>::max()) {
// no_print = false;
// }
// }
// if (no_print || site.most_probable_idx == (site.site_info >> 3)) {
// return;
// }

// if (site.most_probable_idx <= 3) {
// altAllele += getNucleotideFromIndex(site.most_probable_idx);
// } else {
// if (site.most_probable_idx <= 3 + site.insertions.size()) {
// int idx = 0;
// for (const auto& ins : site.insertions) {
// if (idx == site.most_probable_idx - 4) {
// altAllele += (refAllele + ins.first);
// }
// idx += 1;
// }
// } else {
// int idx = 0;
// for (const auto& del : site.deletions) {
// if (idx == site.most_probable_idx - site.insertions.size() - 4) {
// altAllele = refAllele;
// refAllele += del.first;
// }
// idx += 1;
// }
// }
// }



cout << "ref" << "\t" // #CHROM
<< position << "\t" // POS
<< "." << "\t" // ID
<< refAllele << "\t" // REF
<< altAllele << "\t" // ALT
<< "." << "\t" // QUAL
<< "." << "\t" // FILTER
<< "." << "\t" // INFO
<< "GT:DP:PL" << "\t" // FORMAT
<< "sample" << endl; // SAMPLE

// depth and likelihood for reference
ad.push_back(site.read_depth[ref_nuc_idx]);
pl.push_back(site.posteriors[ref_nuc_idx]);
if (pl.back() == 0.0) {
gt = 0;
}

// substitutions
for (int i = 0; i < 4; i++) {
if (i == ref_nuc_idx) {
continue;
}
if (site.posteriors[i] != numeric_limits<double>::max()) {
altAlleles.push_back(getNucleotideFromIndex(i) + lds);
ad.push_back(site.read_depth[i]);
pl.push_back(site.posteriors[i]);
if (pl.back() == 0.0) {
gt = altAlleles.size();
}
}
}

// insertions
size_t indelIdx = 0;
for (const auto& ins : site.insertions) {
altAlleles.push_back(getNucleotideFromIndex(ref_nuc_idx) + ins.first + lds);
ad.push_back(site.read_depth[4 + indelIdx]);
pl.push_back(site.posteriors[4 + indelIdx]);
if (pl.back() == 0.0) {
gt = altAlleles.size();
}
indelIdx += 1;
}

// deletions
for (const auto& del : site.deletions) {
size_t delSize = del.first.size();
if (delSize == ldl) {
altAlleles.push_back(refAllele.substr(0, 1));
} else {
altAlleles.push_back(getNucleotideFromIndex(ref_nuc_idx) + lds.substr(delSize, ldl - delSize));
}
ad.push_back(site.read_depth[4 + indelIdx]);
pl.push_back(site.posteriors[4 + indelIdx]);
if (pl.back() == 0.0) {
gt = altAlleles.size();
}
indelIdx += 1;
}



cout << "ref" << "\t" // #CHROM
<< position << "\t" // POS
<< "." << "\t" // ID
<< refAllele << "\t"; // REF
for (size_t i = 0; i < altAlleles.size() - 1; i++) {
cout << altAlleles[i] << ",";
}
cout << altAlleles[altAlleles.size() - 1] << "\t"; // ALT
cout << quality << "\t" // QUAL
<< "." << "\t" // FILTER
<< "DP=" << readDepth << "\t" // INFO
<< "GT:AD:PL" << "\t" // FORMAT
<< gt << ":"; // SAMPLE
for (size_t i = 0; i < ad.size() - 1; i++) {
cout << ad[i] << ",";
}
cout << ad[ad.size() - 1] << ":";
for (size_t i = 0; i < pl.size() - 1; i++) {
cout << pl[i] << ",";
}
cout << pl[pl.size() - 1] << "\t\t" << site.tmp_readbase_string << endl;
}

}
Expand Down
33 changes: 28 additions & 5 deletions src/test/placement.test.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#define BOOST_TEST_MODULE Index construction
#define BOOST_TEST_MODULE Index construction
#include <boost/test/included/unit_test.hpp>
#include "../PangenomeMAT.hpp"
#include <stack>
Expand Down Expand Up @@ -516,6 +516,8 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) {
std::string best_match;
PangenomeMAT::placeSample(pT, pruned_sample_fastq_1, index, k, s, best_match);

cout << pruned_sample << "\t" << T->allNodes[pruned_sample]->parent->identifier << "\t" << best_match << endl;

const std::string ref_node_seq = T->getStringFromReference(best_match, false);
std::string reference_node_fa_path = "../src/test/statsgenotype/fasta/" + best_match + ".fa";
if(!filesystem::exists(reference_node_fa_path)) {
Expand All @@ -529,16 +531,20 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) {
}

std::string sam_path = "../src/test/statsgenotype/sam/" + pruned_sample + ".sam";
std::string minimap_cmd_str = "minimap2 -ax sr " + reference_node_fa_path + " " + pruned_sample_fastq_1 + " " + pruned_sample_fastq_2 + " --heap-sort=yes | samtools sort -O sam > " + sam_path;
cout << minimap_cmd_str << endl;
if (!filesystem::exists(sam_path)) {
std::string minimap_cmd_str = "minimap2 -ax sr " + reference_node_fa_path + " " + pruned_sample_fastq_1 + " " + pruned_sample_fastq_2 + " --heap-sort=yes | samtools sort -O sam > " + sam_path;
const char* minimap_cmd = minimap_cmd_str.c_str();
cout << minimap_cmd_str << endl;
system(minimap_cmd);
}

std::string plu_path = "../src/test/statsgenotype/pileup/" + pruned_sample + ".pileup";
std::string pileup_cmd_str = "samtools mpileup " + sam_path + " -f " + reference_node_fa_path + " -A > " + plu_path;
cout << pileup_cmd_str << endl;
if (!filesystem::exists(plu_path)) {
std::string pileup_cmd_str = "samtools mpileup " + sam_path + " -f " + reference_node_fa_path + " > " + plu_path;
const char* pileup_cmd = pileup_cmd_str.c_str();
cout << pileup_cmd_str << endl;
system(pileup_cmd);
}

Expand All @@ -558,10 +564,27 @@ BOOST_AUTO_TEST_CASE(genotypeUncertainties) {
// << best_match << "\t"
// << get_distance(T, T->allNodes[best_match], T->allNodes[pruned_sample]->parent) << "\t"
// << get_distance_branch(T, T->allNodes[best_match], T->allNodes[pruned_sample]->parent) << endl;

break;

}
std::ifstream testplu("../src/test/statsgenotype/pileup/test/test.pileup");
std::ifstream min("../mutation_matrices/sars_4k.mutmat");
T->printSamplePlacementVCF(testplu, &min);

// std::ifstream ecis("../ecoli_100.pmat");
// boost::iostreams::filtering_streambuf< boost::iostreams::input> ecinPMATBuffer;
// ecinPMATBuffer.push(boost::iostreams::gzip_decompressor());
// ecinPMATBuffer.push(ecis);
// std::istream ecinputStream(&ecinPMATBuffer);
// Tree *ecT = new Tree(ecinputStream);

// cout << "NZ_CP007594.1 -> node_63" << endl;
// Node* sampleNode = ecT->allNodes["NZ_CP007594.1"];
// cout << sampleNode->parent->identifier << endl;
// cout << sampleNode->nucMutation.size() << endl;
// cout << sampleNode->blockMutation.size() << endl;
// for (const auto& mut : sampleNode->nucMutation) {
// cout << mut.nucPosition << "\t" << (mut.mutInfo >> 4) << "\t" << (mut.mutInfo & 15) << endl;
// }

is.close();
}
Expand Down
6 changes: 6 additions & 0 deletions src/test/statsgenotype/pileup/test/test.pileup
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
test_ref 21 A 5 CccCC FFFFF
test_ref 96 G 3 .C, FFF
test_ref 100 A 5 ,GTC. FFFFF
test_ref 169 T 4 .-2TT.-2TT..+1G FFFF
test_ref 250 C 5 .-2CG,-2cg.-2CG.-2CG, FFFFF
test_ref 420 G 5 .-2TT.-2TT.-1T.,-2TT FFFFF

0 comments on commit ccd4761

Please sign in to comment.