From c842ee84ba0dd953f86e5c1e0e10616637ece160 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Tue, 8 Oct 2024 11:47:56 +0200 Subject: [PATCH 01/30] Fix for queries with invalid characters --- .../graph/representation/canonical_dbg.cpp | 2 +- .../graph/representation/hash/dbg_sshash.cpp | 47 ++++++++++++++----- .../graph/representation/hash/dbg_sshash.hpp | 6 +++ .../tests/annotation/test_annotated_dbg.cpp | 7 ++- .../annotation/test_annotated_dbg_helpers.cpp | 1 + .../tests/graph/all/test_dbg_helpers.cpp | 7 ++- 6 files changed, 53 insertions(+), 17 deletions(-) diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index fdd4bd683e..a944cd958c 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -64,7 +64,7 @@ ::map_to_nodes_sequentially(std::string_view sequence, path.reserve(sequence.size() - get_k() + 1); if (const auto sshash = std::dynamic_pointer_cast(graph_)) { - sshash->map_to_nodes_with_rc<>(sequence, [&](node_index node, bool orientation) { + sshash->map_to_nodes_with_rc(sequence, [&](node_index node, bool orientation) { callback(node && orientation ? reverse_complement(node) : node); }, terminate); return; diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 9e47d1fe3d..ecda8b57ea 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -5,6 +5,7 @@ #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" #include "common/logger.hpp" +#include "common/algorithms.hpp" #include "kmer/kmer_extractor.hpp" @@ -99,6 +100,37 @@ void DBGSSHash::add_sequence(std::string_view sequence, throw std::logic_error("adding sequences not supported"); } +void DBGSSHash +::map_to_nodes_with_rc_advanced(std::string_view sequence, + const std::function& callback, + bool with_rc, + const std::function& terminate) const { + if (terminate() || sequence.size() < k_) + return; + + std::visit([&](const auto &dict) { + using kmer_t = get_kmer_t; + + std::vector seq_encoded; + seq_encoded.reserve(sequence.size()); + for (size_t i = 0; i < sequence.size(); ++i) { + char enc = kmer_t::canonicalize_basepair_forward_map[static_cast(sequence[i])]; + seq_encoded.emplace_back(enc == '\0'); + } + + auto invalid = utils::drag_and_mark_segments(seq_encoded, 1, k_); + + kmer_t uint_kmer = sshash::util::string_to_uint_kmer(sequence.data(), k_ - 1); + uint_kmer.pad_char(); + for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) { + uint_kmer.drop_char(); + uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i])); + callback(!invalid[i] ? dict.lookup_advanced_uint(uint_kmer, with_rc) + : sshash::lookup_result()); + } + }, dict_); +} + template void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence, const std::function& callback, @@ -113,18 +145,11 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence, return; } - std::visit([&](const auto &dict) { - using kmer_t = get_kmer_t; - kmer_t uint_kmer = sshash::util::string_to_uint_kmer(sequence.data(), k_ - 1); - uint_kmer.pad_char(); - for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) { - uint_kmer.drop_char(); - uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i])); - auto res = dict.lookup_advanced_uint(uint_kmer, with_rc); - callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation); - } - }, dict_); + map_to_nodes_with_rc_advanced(sequence, [&](sshash::lookup_result res) { + callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation); + }, with_rc, terminate); } + template void DBGSSHash::map_to_nodes_with_rc(std::string_view, const std::function&, diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.hpp b/metagraph/src/graph/representation/hash/dbg_sshash.hpp index 12c1a407bd..0f3da5dc5a 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.hpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.hpp @@ -117,6 +117,12 @@ class DBGSSHash : public DeBruijnGraph { size_t num_nodes_; Mode mode_; + void map_to_nodes_with_rc_advanced( + std::string_view sequence, + const std::function& callback, + bool with_rc, + const std::function& terminate = []() { return false; }) const; + size_t dict_size() const; }; diff --git a/metagraph/tests/annotation/test_annotated_dbg.cpp b/metagraph/tests/annotation/test_annotated_dbg.cpp index 278f01f1f0..d1437aa725 100644 --- a/metagraph/tests/annotation/test_annotated_dbg.cpp +++ b/metagraph/tests/annotation/test_annotated_dbg.cpp @@ -4,15 +4,12 @@ #include "gtest/gtest.h" #include "../test_helpers.hpp" +#include "../graph/all/test_dbg_helpers.hpp" #include "common/threads/threading.hpp" #include "common/vectors/bit_vector_dyn.hpp" #include "common/vectors/vector_algorithm.hpp" #include "annotation/representation/column_compressed/annotate_column_compressed.hpp" -#include "graph/representation/bitmap/dbg_bitmap.hpp" -#include "graph/representation/hash/dbg_hash_string.hpp" -#include "graph/representation/hash/dbg_hash_ordered.hpp" -#include "graph/representation/hash/dbg_hash_fast.hpp" #define protected public #define private public @@ -987,6 +984,7 @@ typedef ::testing::Types>, std::pair>, std::pair>, std::pair>, + std::pair>, std::pair, std::pair, std::pair, @@ -1016,6 +1014,7 @@ class AnnotatedDBGNoNTest : public ::testing::Test {}; typedef ::testing::Types>, std::pair>, std::pair>, + std::pair>, std::pair, std::pair, std::pair, diff --git a/metagraph/tests/annotation/test_annotated_dbg_helpers.cpp b/metagraph/tests/annotation/test_annotated_dbg_helpers.cpp index 2c6b1f5735..39e387335e 100644 --- a/metagraph/tests/annotation/test_annotated_dbg_helpers.cpp +++ b/metagraph/tests/annotation/test_annotated_dbg_helpers.cpp @@ -235,6 +235,7 @@ template std::unique_ptr build_anno_graph build_anno_graph>(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); template std::unique_ptr build_anno_graph>(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); template std::unique_ptr build_anno_graph>(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); +template std::unique_ptr build_anno_graph>(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); template std::unique_ptr build_anno_graph(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); template std::unique_ptr build_anno_graph(uint64_t, const std::vector &, const std::vector&, DeBruijnGraph::Mode, bool); diff --git a/metagraph/tests/graph/all/test_dbg_helpers.cpp b/metagraph/tests/graph/all/test_dbg_helpers.cpp index 82c6878024..a4d151a093 100644 --- a/metagraph/tests/graph/all/test_dbg_helpers.cpp +++ b/metagraph/tests/graph/all/test_dbg_helpers.cpp @@ -1,6 +1,10 @@ #include "test_dbg_helpers.hpp" +#include "../../annotation/test_annotated_dbg_helpers.hpp" +#include "annotation/representation/column_compressed/annotate_column_compressed.hpp" + #include "gtest/gtest.h" +#include "graph/annotated_dbg.hpp" #include "graph/representation/canonical_dbg.hpp" #include "graph/representation/succinct/boss.hpp" #include "graph/representation/succinct/boss_construct.hpp" @@ -146,6 +150,7 @@ void writeFastaFile(const std::vector& sequences, const std::string fastaFile.close(); } + template <> std::shared_ptr build_graph(uint64_t k, @@ -155,7 +160,7 @@ build_graph(uint64_t k, return std::make_shared(k, mode); // use DBGHashString to get contigs for SSHash - auto string_graph = build_graph(k, sequences, mode); + auto string_graph = build_graph(k, sequences, mode); std::vector contigs; size_t num_kmers = 0; From dbc0cb8f53925554c6ff87a5eb9cc272d12de1fd Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Tue, 8 Oct 2024 11:51:10 +0200 Subject: [PATCH 02/30] indicate colour breakpoints to support monotig graphs with sshash --- metagraph/src/cli/build.cpp | 6 +- metagraph/src/cli/config/config.cpp | 3 + metagraph/src/cli/config/config.hpp | 1 + metagraph/src/cli/query.cpp | 5 +- .../src/graph/alignment/annotation_buffer.cpp | 1 + metagraph/src/graph/annotated_dbg.cpp | 101 ++++++++++---- metagraph/src/graph/annotated_dbg.hpp | 7 +- .../representation/base/sequence_graph.hpp | 2 + .../graph/representation/canonical_dbg.hpp | 2 + .../graph/representation/hash/dbg_sshash.cpp | 127 +++++++++++++++++- .../graph/representation/hash/dbg_sshash.hpp | 12 +- .../tests/annotation/test_annotated_dbg.cpp | 22 +-- .../annotation/test_annotated_dbg_helpers.cpp | 1 + .../tests/graph/all/test_dbg_helpers.cpp | 72 ++++++++++ .../tests/graph/all/test_dbg_helpers.hpp | 8 ++ metagraph/tests/graph/test_canonical_dbg.cpp | 3 +- metagraph/tests/graph/test_dbg_canonical.cpp | 3 +- 17 files changed, 335 insertions(+), 41 deletions(-) diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp index d58cbde119..6a193d7214 100644 --- a/metagraph/src/cli/build.cpp +++ b/metagraph/src/cli/build.cpp @@ -251,12 +251,16 @@ int build_graph(Config *config) { } } else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) { - graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars)); if (files.size() > 1) { logger->error("DBGSSHash does not support multiple input files."); exit(1); } + graph.reset(new DBGSSHash(files.at(0), + config->k, + config->graph_mode, + config->num_chars, + config->is_monochromatic)); } else { //slower method switch (config->graph_type) { diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp index 7312ff93ba..fd81cfd26b 100644 --- a/metagraph/src/cli/config/config.cpp +++ b/metagraph/src/cli/config/config.cpp @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) { dynamic = true; } else if (!strcmp(argv[i], "--mask-dummy")) { mark_dummy_kmers = true; + } else if (!strcmp(argv[i], "--is-monochromatic")) { + is_monochromatic = true; } else if (!strcmp(argv[i], "--anno-filename")) { filename_anno = true; } else if (!strcmp(argv[i], "--anno-header")) { @@ -972,6 +974,7 @@ if (advanced) { fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n"); #endif fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n"); + fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n"); fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n"); if (advanced) { fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n"); diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp index b4848812d8..aae5304270 100644 --- a/metagraph/src/cli/config/config.hpp +++ b/metagraph/src/cli/config/config.hpp @@ -28,6 +28,7 @@ class Config { bool complete = false; bool dynamic = false; bool mark_dummy_kmers = false; + bool is_monochromatic = false; bool filename_anno = false; bool annotate_sequence_headers = false; bool to_adj_list = false; diff --git a/metagraph/src/cli/query.cpp b/metagraph/src/cli/query.cpp index cb7d2e35c9..fcb41c61b5 100644 --- a/metagraph/src/cli/query.cpp +++ b/metagraph/src/cli/query.cpp @@ -955,8 +955,9 @@ construct_query_graph(const AnnotatedDBG &anno_graph, #pragma omp parallel for num_threads(num_threads) for (size_t i = 0; i < contigs.size(); ++i) { contigs[i].second.reserve(contigs[i].first.length() - graph_init->get_k() + 1); - full_dbg.map_to_nodes(contigs[i].first, - [&](node_index node) { contigs[i].second.push_back(node); }); + call_annotated_nodes_offsets(full_dbg, contigs[i].first, [&](node_index node, int64_t) { + contigs[i].second.push_back(node); + }); } logger->trace("[Query graph construction] Contigs mapped to the full graph in {} sec", timer.elapsed()); diff --git a/metagraph/src/graph/alignment/annotation_buffer.cpp b/metagraph/src/graph/alignment/annotation_buffer.cpp index 4020f312a7..16d501385b 100644 --- a/metagraph/src/graph/alignment/annotation_buffer.cpp +++ b/metagraph/src/graph/alignment/annotation_buffer.cpp @@ -48,6 +48,7 @@ void AnnotationBuffer::fetch_queued_annotations() { for (const auto &path : queued_paths_) { std::vector base_path; + std::vector base_path_offsets; if (base_graph->get_mode() == DeBruijnGraph::CANONICAL) { // TODO: avoid this call of spell_path std::string query = spell_path(graph_, path); diff --git a/metagraph/src/graph/annotated_dbg.cpp b/metagraph/src/graph/annotated_dbg.cpp index 430365365a..6fc6ea257c 100644 --- a/metagraph/src/graph/annotated_dbg.cpp +++ b/metagraph/src/graph/annotated_dbg.cpp @@ -6,6 +6,7 @@ #include "annotation/representation/row_compressed/annotate_row_compressed.hpp" #include "annotation/int_matrix/base/int_matrix.hpp" #include "graph/representation/canonical_dbg.hpp" +#include "graph/representation/hash/dbg_sshash.hpp" #include "common/aligned_vector.hpp" #include "common/vectors/vector_algorithm.hpp" #include "common/vector_map.hpp" @@ -23,6 +24,21 @@ using Column = mtg::annot::matrix::BinaryMatrix::Column; typedef AnnotatedDBG::Label Label; typedef std::pair StringCountPair; +void call_annotated_nodes_offsets(const SequenceGraph &graph, + std::string_view sequence, + const std::function &callback) { + if (const auto *sshash = dynamic_cast(&graph)) { + if (sshash->is_monochromatic()) { + sshash->map_to_contigs_with_rc(sequence, [&](SequenceGraph::node_index i, int64_t offset, bool) { + callback(i, offset); + }); + return; + } + } + + graph.map_to_nodes(sequence, [&](SequenceGraph::node_index i) { callback(i, 0); }); +} + AnnotatedSequenceGraph ::AnnotatedSequenceGraph(std::shared_ptr graph, @@ -46,10 +62,9 @@ ::annotate_sequence(std::string_view sequence, std::vector indices; indices.reserve(sequence.size()); - - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); if (!indices.size()) @@ -80,9 +95,9 @@ ::annotate_sequences(const std::vector if (!indices.capacity()) indices.reserve(data[t].first.size()); - graph_->map_to_nodes(data[t].first, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, data[t].first, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); } @@ -117,10 +132,10 @@ void AnnotatedDBG::add_kmer_counts(std::string_view sequence, indices.reserve(sequence.size() - dbg_.get_k() + 1); size_t end = 0; - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t) { // only insert indexes for matched k-mers and shift counts accordingly if (i > 0) { - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); kmer_counts[indices.size() - 1] = kmer_counts[end++]; } }); @@ -139,15 +154,22 @@ void AnnotatedDBG::add_kmer_coord(std::string_view sequence, if (sequence.size() < dbg_.get_k()) return; - std::vector indices = map_to_nodes(dbg_, sequence); + std::vector indices; + std::vector offsets; + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t offset) { + indices.emplace_back(i); + offsets.emplace_back(offset); + }); std::lock_guard lock(mutex_); + auto it = offsets.begin(); for (node_index i : indices) { // only insert coordinates for matched k-mers and increment the coordinates if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); coord++; + ++it; } } @@ -156,10 +178,17 @@ void AnnotatedDBG::add_kmer_coords( assert(check_compatibility()); std::vector> ids; + std::vector> offsets; ids.reserve(data.size()); for (const auto &[sequence, labels, _] : data) { - if (sequence.size() >= dbg_.get_k()) - ids.push_back(map_to_nodes(dbg_, sequence)); + if (sequence.size() >= dbg_.get_k()) { + auto &id = ids.emplace_back(); + auto &offset = offsets.emplace_back(); + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t o) { + id.emplace_back(i); + offset.emplace_back(o); + }); + } } std::lock_guard lock(mutex_); @@ -168,11 +197,13 @@ void AnnotatedDBG::add_kmer_coords( const auto &labels = std::get<1>(data[t]); uint64_t coord = std::get<2>(data[t]); + auto it = offsets[t].begin(); for (node_index i : ids[t]) { // only insert coordinates for matched k-mers and increment the coordinates if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); coord++; + ++it; } } } @@ -200,10 +231,10 @@ void AnnotatedDBG::annotate_kmer_coords( coords[last].reserve(sequence.size() - dbg_.get_k() + 1); } - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) { if (i > 0) { ids[last].push_back(graph_to_anno_index(i)); - coords[last].emplace_back(graph_to_anno_index(i), coord); + coords[last].emplace_back(graph_to_anno_index(i), coord - o); } coord++; }); @@ -238,7 +269,7 @@ std::vector