diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp index d58cbde119..6a193d7214 100644 --- a/metagraph/src/cli/build.cpp +++ b/metagraph/src/cli/build.cpp @@ -251,12 +251,16 @@ int build_graph(Config *config) { } } else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) { - graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars)); if (files.size() > 1) { logger->error("DBGSSHash does not support multiple input files."); exit(1); } + graph.reset(new DBGSSHash(files.at(0), + config->k, + config->graph_mode, + config->num_chars, + config->is_monochromatic)); } else { //slower method switch (config->graph_type) { diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp index 7312ff93ba..fd81cfd26b 100644 --- a/metagraph/src/cli/config/config.cpp +++ b/metagraph/src/cli/config/config.cpp @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) { dynamic = true; } else if (!strcmp(argv[i], "--mask-dummy")) { mark_dummy_kmers = true; + } else if (!strcmp(argv[i], "--is-monochromatic")) { + is_monochromatic = true; } else if (!strcmp(argv[i], "--anno-filename")) { filename_anno = true; } else if (!strcmp(argv[i], "--anno-header")) { @@ -972,6 +974,7 @@ if (advanced) { fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n"); #endif fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n"); + fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n"); fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n"); if (advanced) { fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n"); diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp index b4848812d8..aae5304270 100644 --- a/metagraph/src/cli/config/config.hpp +++ b/metagraph/src/cli/config/config.hpp @@ -28,6 +28,7 @@ class Config { bool complete = false; bool dynamic = false; bool mark_dummy_kmers = false; + bool is_monochromatic = false; bool filename_anno = false; bool annotate_sequence_headers = false; bool to_adj_list = false; diff --git a/metagraph/src/cli/query.cpp b/metagraph/src/cli/query.cpp index cb7d2e35c9..fcb41c61b5 100644 --- a/metagraph/src/cli/query.cpp +++ b/metagraph/src/cli/query.cpp @@ -955,8 +955,9 @@ construct_query_graph(const AnnotatedDBG &anno_graph, #pragma omp parallel for num_threads(num_threads) for (size_t i = 0; i < contigs.size(); ++i) { contigs[i].second.reserve(contigs[i].first.length() - graph_init->get_k() + 1); - full_dbg.map_to_nodes(contigs[i].first, - [&](node_index node) { contigs[i].second.push_back(node); }); + call_annotated_nodes_offsets(full_dbg, contigs[i].first, [&](node_index node, int64_t) { + contigs[i].second.push_back(node); + }); } logger->trace("[Query graph construction] Contigs mapped to the full graph in {} sec", timer.elapsed()); diff --git a/metagraph/src/graph/alignment/annotation_buffer.cpp b/metagraph/src/graph/alignment/annotation_buffer.cpp index 4020f312a7..8870b71e0b 100644 --- a/metagraph/src/graph/alignment/annotation_buffer.cpp +++ b/metagraph/src/graph/alignment/annotation_buffer.cpp @@ -2,6 +2,7 @@ #include "graph/representation/rc_dbg.hpp" #include "graph/representation/succinct/dbg_succinct.hpp" +#include "graph/representation/hash/dbg_sshash.hpp" #include "graph/representation/canonical_dbg.hpp" #include "annotation/binary_matrix/base/binary_matrix.hpp" #include "common/utils/template_utils.hpp" @@ -34,8 +35,6 @@ AnnotationBuffer::AnnotationBuffer(const DeBruijnGraph &graph, const Annotator & void AnnotationBuffer::fetch_queued_annotations() { assert(graph_.get_mode() != DeBruijnGraph::PRIMARY && "PRIMARY graphs must be wrapped into CANONICAL"); - - std::vector queued_nodes; std::vector queued_rows; const DeBruijnGraph *base_graph = &graph_; @@ -45,14 +44,21 @@ void AnnotationBuffer::fetch_queued_annotations() { const auto *dbg_succ = dynamic_cast(base_graph); const boss::BOSS *boss = dbg_succ ? &dbg_succ->get_boss() : nullptr; + const DBGSSHash *sshash = dynamic_cast(base_graph); + std::vector> to_update; for (const auto &path : queued_paths_) { std::vector base_path; - if (base_graph->get_mode() == DeBruijnGraph::CANONICAL) { + std::vector base_path_offsets; + if (base_graph->get_mode() == DeBruijnGraph::CANONICAL || (sshash && sshash->is_monochromatic())) { // TODO: avoid this call of spell_path std::string query = spell_path(graph_, path); - base_path = map_to_nodes(*base_graph, query); - + base_path.reserve(path.size()); + call_annotated_nodes_offsets(graph_, query, [&](node_index i, int64_t o) { + assert(boss || i != DeBruijnGraph::npos); + base_path.emplace_back(i); + base_path_offsets.emplace_back(o); + }); } else if (canonical_) { base_path.reserve(path.size()); for (node_index node : path) { @@ -66,125 +72,108 @@ void AnnotationBuffer::fetch_queued_annotations() { std::reverse(base_path.begin(), base_path.end()); } + base_path_offsets.resize(base_path.size()); + assert(base_path.size() == path.size()); for (size_t i = 0; i < path.size(); ++i) { - if (base_path[i] == DeBruijnGraph::npos) { + if (base_path[i] == DeBruijnGraph::npos + || (boss && !boss->get_W(dbg_succ->kmer_to_boss_index(base_path[i])))) { // this can happen when the base graph is CANONICAL and path[i] is a // dummy node - if (node_to_cols_.try_emplace(path[i], 0).second && has_coordinates()) - label_coords_.emplace_back(); - - continue; - } - - if (boss && !boss->get_W(dbg_succ->kmer_to_boss_index(base_path[i]))) { - // skip dummy nodes if (node_to_cols_.try_emplace(base_path[i], 0).second && has_coordinates()) label_coords_.emplace_back(); - if (graph_.get_mode() == DeBruijnGraph::CANONICAL - && base_path[i] != path[i] - && node_to_cols_.emplace(path[i], 0).second && has_coordinates()) { + if (node_to_cols_.try_emplace(path[i], 0).second && has_coordinates()) label_coords_.emplace_back(); - } continue; } - Row row = AnnotatedDBG::graph_to_anno_index(base_path[i]); - if (canonical_ || graph_.get_mode() == DeBruijnGraph::BASIC) { - if (node_to_cols_.try_emplace(base_path[i], nannot).second) { - queued_rows.push_back(row); - queued_nodes.push_back(base_path[i]); - } + to_update.emplace_back(base_path[i], path[i], base_path_offsets[i]); - continue; - } - - assert(graph_.get_mode() == DeBruijnGraph::CANONICAL); - - auto find_a = node_to_cols_.find(path[i]); - auto find_b = node_to_cols_.find(base_path[i]); - - if (find_a == node_to_cols_.end() && find_b == node_to_cols_.end()) { - node_to_cols_.try_emplace(path[i], nannot); + Row row = AnnotatedDBG::graph_to_anno_index(base_path[i]); + if (node_to_cols_.try_emplace(base_path[i], nannot).second) { queued_rows.push_back(row); - queued_nodes.push_back(path[i]); - - if (path[i] != base_path[i]) { - node_to_cols_.emplace(base_path[i], nannot); - queued_rows.push_back(row); - queued_nodes.push_back(base_path[i]); - } - } else if (find_a == node_to_cols_.end() && find_b != node_to_cols_.end()) { - node_to_cols_.try_emplace(path[i], find_b->second); - if (find_b->second == nannot) { - queued_rows.push_back(row); - queued_nodes.push_back(path[i]); - } - } else if (find_a != node_to_cols_.end() && find_b == node_to_cols_.end()) { - node_to_cols_.try_emplace(base_path[i], find_a->second); - } else { - size_t label_i = std::min(find_a->second, find_b->second); - if (label_i != nannot) { - find_a.value() = label_i; - find_b.value() = label_i; - } + if (has_coordinates()) + label_coords_.emplace_back(); } } } - queued_paths_.clear(); - - if (queued_nodes.empty()) - return; + assert(!has_coordinates() || node_to_cols_.size() == label_coords_.size()); - auto push_node_labels = [&](auto node_it, auto row_it, auto&& labels) { - assert(node_it != queued_nodes.end()); - assert(node_to_cols_.count(*node_it)); - assert(node_to_cols_.count(AnnotatedDBG::anno_to_graph_index(*row_it))); - - size_t label_i = cache_column_set(std::move(labels)); - node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it); - if (graph_.get_mode() == DeBruijnGraph::BASIC) { - assert(base_node == *node_it); - node_to_cols_[*node_it] = label_i; - } else if (canonical_) { - node_to_cols_[base_node] = label_i; - } else { - node_to_cols_[*node_it] = label_i; - if (base_node != *node_it && node_to_cols_.try_emplace(base_node, label_i).second - && has_coordinates()) { - label_coords_.emplace_back(label_coords_.back()); - } - } - }; + queued_paths_.clear(); - auto node_it = queued_nodes.begin(); auto row_it = queued_rows.begin(); if (has_coordinates()) { assert(multi_int_); // extract both labels and coordinates, then store them separately for (auto&& row_tuples : multi_int_->get_row_tuples(queued_rows)) { + assert(row_it != queued_rows.end()); std::sort(row_tuples.begin(), row_tuples.end(), utils::LessFirst()); + + node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it); + auto find_base = node_to_cols_.find(base_node); + assert(find_base != node_to_cols_.end()); + assert(find_base->second == nannot); + + size_t coord_idx = find_base - node_to_cols_.begin(); + assert(coord_idx < label_coords_.size()); + Columns labels; labels.reserve(row_tuples.size()); - label_coords_.emplace_back(); - label_coords_.back().reserve(row_tuples.size()); + auto &label_coords = label_coords_[coord_idx]; + label_coords.reserve(row_tuples.size()); for (auto&& [label, coords] : row_tuples) { labels.push_back(label); - label_coords_.back().emplace_back(coords.begin(), coords.end()); + label_coords.emplace_back(coords.begin(), coords.end()); } - push_node_labels(node_it++, row_it++, std::move(labels)); + + find_base.value() = cache_column_set(std::move(labels)); + + ++row_it; } } else { for (auto&& labels : annotator_.get_matrix().get_rows(queued_rows)) { + assert(row_it != queued_rows.end()); + std::sort(labels.begin(), labels.end()); - push_node_labels(node_it++, row_it++, std::move(labels)); + + node_index base_node = AnnotatedDBG::anno_to_graph_index(*row_it); + auto find_base = node_to_cols_.find(base_node); + assert(find_base != node_to_cols_.end()); + assert(find_base->second == nannot); + + find_base.value() = cache_column_set(std::move(labels)); + + ++row_it; } } + for (const auto &[base_node, node, offset] : to_update) { + auto find_base = node_to_cols_.find(base_node); + assert(find_base != node_to_cols_.end()); + assert(find_base->second != nannot); + + size_t coord_idx = find_base - node_to_cols_.begin(); + size_t label_i = find_base->second; + + assert(!node_to_cols_.count(node) || node_to_cols_.find(node)->second != nannot); + + if (node_to_cols_.try_emplace(node, label_i).second && has_coordinates()) { + assert(coord_idx < label_coords_.size()); + label_coords_.emplace_back(label_coords_[coord_idx]); + for (auto &coords : label_coords_.back()) { + for (auto &c : coords) { + c += offset; + } + } + } + } + + assert(!has_coordinates() || node_to_cols_.size() == label_coords_.size()); + #ifndef NDEBUG for (const auto &[node, val] : node_to_cols_) { assert(val != nannot); @@ -196,9 +185,6 @@ auto AnnotationBuffer::get_labels_and_coords(node_index node) const -> std::pair { std::pair ret_val { nullptr, nullptr }; - if (canonical_) - node = canonical_->get_base_node(node); - auto it = node_to_cols_.find(node); // if the node hasn't been seen before, or if its annotations haven't diff --git a/metagraph/src/graph/annotated_dbg.cpp b/metagraph/src/graph/annotated_dbg.cpp index 430365365a..9efbdd3bfe 100644 --- a/metagraph/src/graph/annotated_dbg.cpp +++ b/metagraph/src/graph/annotated_dbg.cpp @@ -6,6 +6,7 @@ #include "annotation/representation/row_compressed/annotate_row_compressed.hpp" #include "annotation/int_matrix/base/int_matrix.hpp" #include "graph/representation/canonical_dbg.hpp" +#include "graph/representation/hash/dbg_sshash.hpp" #include "common/aligned_vector.hpp" #include "common/vectors/vector_algorithm.hpp" #include "common/vector_map.hpp" @@ -23,6 +24,19 @@ using Column = mtg::annot::matrix::BinaryMatrix::Column; typedef AnnotatedDBG::Label Label; typedef std::pair StringCountPair; +void call_annotated_nodes_offsets(const SequenceGraph &graph, + std::string_view sequence, + const std::function &callback) { + if (const auto *dbg = dynamic_cast(&graph)) { + if (dbg->is_monochromatic()) { + dbg->map_to_contigs(sequence, callback); + return; + } + } + + graph.map_to_nodes(sequence, [&](SequenceGraph::node_index i) { callback(i, 0); }); +} + AnnotatedSequenceGraph ::AnnotatedSequenceGraph(std::shared_ptr graph, @@ -46,10 +60,9 @@ ::annotate_sequence(std::string_view sequence, std::vector indices; indices.reserve(sequence.size()); - - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); if (!indices.size()) @@ -80,9 +93,9 @@ ::annotate_sequences(const std::vector if (!indices.capacity()) indices.reserve(data[t].first.size()); - graph_->map_to_nodes(data[t].first, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, data[t].first, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); } @@ -117,10 +130,10 @@ void AnnotatedDBG::add_kmer_counts(std::string_view sequence, indices.reserve(sequence.size() - dbg_.get_k() + 1); size_t end = 0; - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t) { // only insert indexes for matched k-mers and shift counts accordingly if (i > 0) { - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); kmer_counts[indices.size() - 1] = kmer_counts[end++]; } }); @@ -139,15 +152,30 @@ void AnnotatedDBG::add_kmer_coord(std::string_view sequence, if (sequence.size() < dbg_.get_k()) return; - std::vector indices = map_to_nodes(dbg_, sequence); + std::vector indices; + std::vector offsets; + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t offset) { + indices.emplace_back(i); + offsets.emplace_back(offset); + }); std::lock_guard lock(mutex_); + auto it = offsets.begin(); + uint64_t last_coord = std::numeric_limits::max(); + int64_t last_offset = std::numeric_limits::max(); for (node_index i : indices) { // only insert coordinates for matched k-mers and increment the coordinates - if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + if (i > 0) { + if (last_coord == std::numeric_limits::max() || coord - *it != last_coord - last_offset) { + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); + last_coord = coord; + last_offset = *it; + } + } + coord++; + ++it; } } @@ -156,10 +184,17 @@ void AnnotatedDBG::add_kmer_coords( assert(check_compatibility()); std::vector> ids; + std::vector> offsets; ids.reserve(data.size()); for (const auto &[sequence, labels, _] : data) { - if (sequence.size() >= dbg_.get_k()) - ids.push_back(map_to_nodes(dbg_, sequence)); + if (sequence.size() >= dbg_.get_k()) { + auto &id = ids.emplace_back(); + auto &offset = offsets.emplace_back(); + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t o) { + id.emplace_back(i); + offset.emplace_back(o); + }); + } } std::lock_guard lock(mutex_); @@ -168,11 +203,20 @@ void AnnotatedDBG::add_kmer_coords( const auto &labels = std::get<1>(data[t]); uint64_t coord = std::get<2>(data[t]); + uint64_t last_coord = std::numeric_limits::max(); + int64_t last_offset = std::numeric_limits::max(); + auto it = offsets[t].begin(); for (node_index i : ids[t]) { // only insert coordinates for matched k-mers and increment the coordinates - if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + if (i > 0) { + if (last_coord == std::numeric_limits::max() || coord - *it != last_coord - last_offset) { + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); + last_coord = coord; + last_offset = *it; + } + } coord++; + ++it; } } } @@ -200,10 +244,16 @@ void AnnotatedDBG::annotate_kmer_coords( coords[last].reserve(sequence.size() - dbg_.get_k() + 1); } - graph_->map_to_nodes(sequence, [&](node_index i) { + uint64_t last_coord = std::numeric_limits::max(); + int64_t last_offset = std::numeric_limits::max(); + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) { if (i > 0) { - ids[last].push_back(graph_to_anno_index(i)); - coords[last].emplace_back(graph_to_anno_index(i), coord); + if (last_coord == std::numeric_limits::max() || coord - o != last_coord - last_offset) { + ids[last].push_back(graph_to_anno_index(i)); + coords[last].emplace_back(graph_to_anno_index(i), coord - o); + last_coord = coord; + last_offset = o; + } } coord++; }); @@ -238,7 +288,7 @@ std::vector