Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option to annotate a representative node from each monochromatic contig #500

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c842ee8
Fix for queries with invalid characters
hmusta Oct 8, 2024
dbc0cb8
indicate colour breakpoints to support monotig graphs with sshash
hmusta Oct 8, 2024
0f91011
simplify
hmusta Oct 8, 2024
12b4b91
fix label generation
hmusta Oct 8, 2024
5860d15
Update metagraph/tests/graph/all/test_dbg_helpers.cpp
hmusta Oct 8, 2024
4a483c2
fix for primary mode sshash graph
hmusta Oct 8, 2024
631fcfb
cleanup
hmusta Oct 8, 2024
5847724
Merge branch 'sshash_query_fixes' into sshash_contig_anno
hmusta Oct 8, 2024
f38fa93
minor
hmusta Oct 8, 2024
ac4ba1f
fixes
hmusta Oct 8, 2024
6889ce0
minor
hmusta Oct 8, 2024
94af0e1
Merge branch 'sshash_query_fixes' into sshash_contig_anno
hmusta Oct 8, 2024
ae8c579
minor
hmusta Oct 8, 2024
9da9ebc
no duplicate coords
hmusta Oct 8, 2024
c7d2469
update and simplify AnnotationBuffer
hmusta Oct 8, 2024
2647221
fix boundary annotation
hmusta Oct 8, 2024
d9a8f1c
minor
hmusta Oct 8, 2024
da10981
Merge remote-tracking branch 'origin/master' into sshash_query_fixes
hmusta Oct 8, 2024
54280b5
Merge branch 'sshash_query_fixes' into sshash_contig_anno
hmusta Oct 8, 2024
3e44b8c
Update metagraph/src/graph/representation/hash/dbg_sshash.cpp
hmusta Oct 8, 2024
6a6fd0a
refactor
hmusta Oct 8, 2024
13fe52f
Merge branch 'sshash_query_fixes' into sshash_contig_anno
hmusta Oct 8, 2024
85af373
fix
hmusta Oct 8, 2024
844e4e1
addressed reviewer comments
hmusta Oct 8, 2024
fca3569
auto-deduce dict type
hmusta Oct 8, 2024
cd03ed2
Update metagraph/src/graph/representation/hash/dbg_sshash.cpp
adamant-pwn Oct 8, 2024
a41ac56
Merge branch 'sshash_query_fixes' into sshash_contig_anno
hmusta Oct 8, 2024
a520da2
Merge remote-tracking branch 'origin/sshash_query_fixes' into sshash_…
hmusta Oct 8, 2024
2ec61d6
minor
hmusta Oct 8, 2024
7081a74
Merge remote-tracking branch 'origin/master' into sshash_contig_anno
hmusta Oct 8, 2024
3000911
simplify
hmusta Oct 8, 2024
bd23e1f
write colour information to fasta in monochromaticity tests
hmusta Oct 9, 2024
f599fec
remove invalid asserts
hmusta Oct 10, 2024
c41142a
added DeBruijnGraph::map_to_contigs, refactoring
hmusta Oct 10, 2024
a7a5741
add functions for mapping nodes to the first and last node of a contig
hmusta Oct 11, 2024
a0ca81d
renamed AnnotatedSequenceGraph::get_labels to get_stored_labels
hmusta Oct 11, 2024
97bad90
parallel file parsing
hmusta Oct 11, 2024
64c4e12
minor
hmusta Oct 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion metagraph/src/cli/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,16 @@ int build_graph(Config *config) {
}

} else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) {
graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars));
if (files.size() > 1) {
logger->error("DBGSSHash does not support multiple input files.");
exit(1);
}

graph.reset(new DBGSSHash(files.at(0),
config->k,
config->graph_mode,
config->num_chars,
config->is_monochromatic));
} else {
//slower method
switch (config->graph_type) {
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) {
dynamic = true;
} else if (!strcmp(argv[i], "--mask-dummy")) {
mark_dummy_kmers = true;
} else if (!strcmp(argv[i], "--is-monochromatic")) {
is_monochromatic = true;
} else if (!strcmp(argv[i], "--anno-filename")) {
filename_anno = true;
} else if (!strcmp(argv[i], "--anno-header")) {
Expand Down Expand Up @@ -972,6 +974,7 @@ if (advanced) {
fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n");
#endif
fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n");
fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n");
fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n");
if (advanced) {
fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Config {
bool complete = false;
bool dynamic = false;
bool mark_dummy_kmers = false;
bool is_monochromatic = false;
bool filename_anno = false;
bool annotate_sequence_headers = false;
bool to_adj_list = false;
Expand Down
5 changes: 3 additions & 2 deletions metagraph/src/cli/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -955,8 +955,9 @@ construct_query_graph(const AnnotatedDBG &anno_graph,
#pragma omp parallel for num_threads(num_threads)
for (size_t i = 0; i < contigs.size(); ++i) {
contigs[i].second.reserve(contigs[i].first.length() - graph_init->get_k() + 1);
full_dbg.map_to_nodes(contigs[i].first,
[&](node_index node) { contigs[i].second.push_back(node); });
call_annotated_nodes_offsets(full_dbg, contigs[i].first, [&](node_index node, int64_t) {
contigs[i].second.push_back(node);
});
}
logger->trace("[Query graph construction] Contigs mapped to the full graph in {} sec",
timer.elapsed());
Expand Down
162 changes: 74 additions & 88 deletions metagraph/src/graph/alignment/annotation_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "graph/representation/rc_dbg.hpp"
#include "graph/representation/succinct/dbg_succinct.hpp"
#include "graph/representation/hash/dbg_sshash.hpp"
#include "graph/representation/canonical_dbg.hpp"
#include "annotation/binary_matrix/base/binary_matrix.hpp"
#include "common/utils/template_utils.hpp"
Expand Down Expand Up @@ -34,8 +35,6 @@ AnnotationBuffer::AnnotationBuffer(const DeBruijnGraph &graph, const Annotator &
void AnnotationBuffer::fetch_queued_annotations() {
assert(graph_.get_mode() != DeBruijnGraph::PRIMARY
&& "PRIMARY graphs must be wrapped into CANONICAL");

std::vector<node_index> queued_nodes;
std::vector<Row> queued_rows;

const DeBruijnGraph *base_graph = &graph_;
Expand All @@ -45,14 +44,21 @@ void AnnotationBuffer::fetch_queued_annotations() {

const auto *dbg_succ = dynamic_cast<const DBGSuccinct*>(base_graph);
const boss::BOSS *boss = dbg_succ ? &dbg_succ->get_boss() : nullptr;
const DBGSSHash *sshash = dynamic_cast<const DBGSSHash*>(base_graph);

std::vector<std::tuple<node_index, node_index, int64_t>> to_update;
for (const auto &path : queued_paths_) {
std::vector<node_index> base_path;
if (base_graph->get_mode() == DeBruijnGraph::CANONICAL) {
std::vector<int64_t> base_path_offsets;
if (base_graph->get_mode() == DeBruijnGraph::CANONICAL || (sshash && sshash->is_monochromatic())) {
// TODO: avoid this call of spell_path
std::string query = spell_path(graph_, path);
base_path = map_to_nodes(*base_graph, query);

base_path.reserve(path.size());
call_annotated_nodes_offsets(graph_, query, [&](node_index i, int64_t o) {
assert(boss || i != DeBruijnGraph::npos);
base_path.emplace_back(i);
base_path_offsets.emplace_back(o);
});
} else if (canonical_) {
base_path.reserve(path.size());
for (node_index node : path) {
Expand All @@ -66,125 +72,108 @@ void AnnotationBuffer::fetch_queued_annotations() {
std::reverse(base_path.begin(), base_path.end());
}

base_path_offsets.resize(base_path.size());

assert(base_path.size() == path.size());

for (size_t i = 0; i < path.size(); ++i) {
if (base_path[i] == DeBruijnGraph::npos) {
if (base_path[i] == DeBruijnGraph::npos
|| (boss && !boss->get_W(dbg_succ->kmer_to_boss_index(base_path[i])))) {
// this can happen when the base graph is CANONICAL and path[i] is a
// dummy node
if (node_to_cols_.try_emplace(path[i], 0).second && has_coordinates())
label_coords_.emplace_back();

continue;
}

if (boss && !boss->get_W(dbg_succ->kmer_to_boss_index(base_path[i]))) {
// skip dummy nodes
if (node_to_cols_.try_emplace(base_path[i], 0).second && has_coordinates())
label_coords_.emplace_back();

if (graph_.get_mode() == DeBruijnGraph::CANONICAL
&& base_path[i] != path[i]
&& node_to_cols_.emplace(path[i], 0).second && has_coordinates()) {
if (node_to_cols_.try_emplace(path[i], 0).second && has_coordinates())
label_coords_.emplace_back();
}

continue;
}

Row row = AnnotatedDBG::graph_to_anno_index(base_path[i]);
if (canonical_ || graph_.get_mode() == DeBruijnGraph::BASIC) {
if (node_to_cols_.try_emplace(base_path[i], nannot).second) {
queued_rows.push_back(row);
queued_nodes.push_back(base_path[i]);
}
to_update.emplace_back(base_path[i], path[i], base_path_offsets[i]);

continue;
}

assert(graph_.get_mode() == DeBruijnGraph::CANONICAL);

auto find_a = node_to_cols_.find(path[i]);
auto find_b = node_to_cols_.find(base_path[i]);

if (find_a == node_to_cols_.end() && find_b == node_to_cols_.end()) {
node_to_cols_.try_emplace(path[i], nannot);
Row row = AnnotatedDBG::graph_to_anno_index(base_path[i]);
if (node_to_cols_.try_emplace(base_path[i], nannot).second) {
queued_rows.push_back(row);
queued_nodes.push_back(path[i]);

if (path[i] != base_path[i]) {
node_to_cols_.emplace(base_path[i], nannot);
queued_rows.push_back(row);
queued_nodes.push_back(base_path[i]);
}
} else if (find_a == node_to_cols_.end() && find_b != node_to_cols_.end()) {
node_to_cols_.try_emplace(path[i], find_b->second);
if (find_b->second == nannot) {
queued_rows.push_back(row);
queued_nodes.push_back(path[i]);
}
} else if (find_a != node_to_cols_.end() && find_b == node_to_cols_.end()) {
node_to_cols_.try_emplace(base_path[i], find_a->second);
} else {
size_t label_i = std::min(find_a->second, find_b->second);
if (label_i != nannot) {
find_a.value() = label_i;
find_b.value() = label_i;
}
if (has_coordinates())
label_coords_.emplace_back();
}
}
}

queued_paths_.clear();

if (queued_nodes.empty())
return;
assert(!has_coordinates() || node_to_cols_.size() == label_coords_.size());

auto push_node_labels = [&](auto node_it, auto row_it, auto&& labels) {
assert(node_it != queued_nodes.end());
assert(node_to_cols_.count(*node_it));
assert(node_to_cols_.count(AnnotatedDBG::anno_to_graph_index(*row_it)));

size_t label_i = cache_column_set(std::move(labels));
node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it);
if (graph_.get_mode() == DeBruijnGraph::BASIC) {
assert(base_node == *node_it);
node_to_cols_[*node_it] = label_i;
} else if (canonical_) {
node_to_cols_[base_node] = label_i;
} else {
node_to_cols_[*node_it] = label_i;
if (base_node != *node_it && node_to_cols_.try_emplace(base_node, label_i).second
&& has_coordinates()) {
label_coords_.emplace_back(label_coords_.back());
}
}
};
queued_paths_.clear();

auto node_it = queued_nodes.begin();
auto row_it = queued_rows.begin();
if (has_coordinates()) {
assert(multi_int_);
// extract both labels and coordinates, then store them separately
for (auto&& row_tuples : multi_int_->get_row_tuples(queued_rows)) {
assert(row_it != queued_rows.end());
std::sort(row_tuples.begin(), row_tuples.end(), utils::LessFirst());

node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it);
auto find_base = node_to_cols_.find(base_node);
assert(find_base != node_to_cols_.end());
assert(find_base->second == nannot);

size_t coord_idx = find_base - node_to_cols_.begin();
assert(coord_idx < label_coords_.size());

Columns labels;
labels.reserve(row_tuples.size());
label_coords_.emplace_back();
label_coords_.back().reserve(row_tuples.size());
auto &label_coords = label_coords_[coord_idx];
label_coords.reserve(row_tuples.size());
for (auto&& [label, coords] : row_tuples) {
labels.push_back(label);
label_coords_.back().emplace_back(coords.begin(), coords.end());
label_coords.emplace_back(coords.begin(), coords.end());
}
push_node_labels(node_it++, row_it++, std::move(labels));

find_base.value() = cache_column_set(std::move(labels));

++row_it;
}
} else {
for (auto&& labels : annotator_.get_matrix().get_rows(queued_rows)) {
assert(row_it != queued_rows.end());

std::sort(labels.begin(), labels.end());
push_node_labels(node_it++, row_it++, std::move(labels));

node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it);
auto find_base = node_to_cols_.find(base_node);
assert(find_base != node_to_cols_.end());
assert(find_base->second == nannot);

find_base.value() = cache_column_set(std::move(labels));

++row_it;
}
}

for (const auto &[base_node, node, offset] : to_update) {
auto find_base = node_to_cols_.find(base_node);
assert(find_base != node_to_cols_.end());
assert(find_base->second != nannot);

size_t coord_idx = find_base - node_to_cols_.begin();
size_t label_i = find_base->second;

assert(!node_to_cols_.count(node) || node_to_cols_.find(node)->second != nannot);

if (node_to_cols_.try_emplace(node, label_i).second && has_coordinates()) {
assert(coord_idx < label_coords_.size());
label_coords_.emplace_back(label_coords_[coord_idx]);
for (auto &coords : label_coords_.back()) {
for (auto &c : coords) {
c += offset;
}
}
}
}

assert(!has_coordinates() || node_to_cols_.size() == label_coords_.size());

#ifndef NDEBUG
for (const auto &[node, val] : node_to_cols_) {
assert(val != nannot);
Expand All @@ -196,9 +185,6 @@ auto AnnotationBuffer::get_labels_and_coords(node_index node) const
-> std::pair<const Columns*, const CoordinateSet*> {
std::pair<const Columns*, const CoordinateSet*> ret_val { nullptr, nullptr };

if (canonical_)
node = canonical_->get_base_node(node);

auto it = node_to_cols_.find(node);

// if the node hasn't been seen before, or if its annotations haven't
Expand Down
Loading
Loading