Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use RowDiff with DBGHashFast in test_aligner_labeled #505

Open
wants to merge 22 commits into
base: rowdiff
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions metagraph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_options(
-Wno-exit-time-destructors
-Wno-deprecated-declarations
-Wno-vla-extension
)

if (NOT CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
Expand Down
9 changes: 8 additions & 1 deletion metagraph/integration_tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

script_path = os.path.dirname(os.path.realpath(__file__))

METAGRAPH = f'{os.getcwd()}/metagraph'
METAGRAPH_EXE = f'{os.getcwd()}/metagraph'
DNA_MODE = os.readlink(METAGRAPH_EXE).endswith("_DNA")
PROTEIN_MODE = os.readlink(METAGRAPH_EXE).endswith("_Protein")
METAGRAPH = METAGRAPH_EXE

def update_prefix(PREFIX):
global METAGRAPH
METAGRAPH = PREFIX + METAGRAPH_EXE

TEST_DATA_DIR = os.path.join(script_path, '..', 'tests', 'data')

Expand Down
5 changes: 5 additions & 0 deletions metagraph/integration_tests/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
import argparse
from helpers import TimeLoggingTestResult
from base import update_prefix


"""Run all integration tests"""
Expand Down Expand Up @@ -32,7 +33,11 @@ def create_test_suite(filter_pattern="*"):
parser = argparse.ArgumentParser(description='Metagraph integration tests.')
parser.add_argument('--test_filter', dest='filter', type=str, default="*",
help='filter test cases (default: run all)')
parser.add_argument('--gdb', dest='use_gdb', action='store_true',
help='run metagraph with gdb')
args = parser.parse_args()
if args.use_gdb:
update_prefix('gdb -ex run -ex bt -ex quit --args ')

result = unittest.TextTestRunner(
resultclass=TimeLoggingTestResult
Expand Down
9 changes: 3 additions & 6 deletions metagraph/integration_tests/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@
import glob
import os

from base import TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
from base import PROTEIN_MODE, DNA_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS


"""Test graph construction and alignment"""

DNA_MODE = os.readlink(METAGRAPH).endswith("_DNA")
PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
Expand Down Expand Up @@ -341,7 +338,7 @@ def test_simple_align_fwd_rev_comp_json_all_graphs(self, representation):

params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation])
self.assertEqual('11', params['k'])
self.assertEqual('16461', params['nodes (k)'])
self.assertEqual('16438', params['nodes (k)'])
self.assertEqual('basic', params['mode'])

stats_command = '{exe} align --json -i {graph} --align-min-exact-match 0.0 {reads}'.format(
Expand All @@ -366,7 +363,7 @@ def test_simple_align_edit_distance_all_graphs(self, representation):

params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation])
self.assertEqual('11', params['k'])
self.assertEqual('16461', params['nodes (k)'])
self.assertEqual('16438', params['nodes (k)'])
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This number decreases if valid_edges is generated and serialized for DBGSuccinct. Probably because dummy nodes are excluded this way. Same with the change above.

self.assertEqual('basic', params['mode'])

stats_command = '{exe} align --json --align-edit-distance -i {graph} --align-min-exact-match 0.0 {reads}'.format(
Expand Down
4 changes: 1 addition & 3 deletions metagraph/integration_tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
import filecmp
import glob
import os
from base import TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
from base import PROTEIN_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS


"""Test graph annotation"""

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
Expand Down
5 changes: 1 addition & 4 deletions metagraph/integration_tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
from concurrent.futures import Future
from parameterized import parameterized, parameterized_class

from base import TestingBase, METAGRAPH, TEST_DATA_DIR

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

from base import PROTEIN_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR

class TestAPIBase(TestingBase):
@classmethod
Expand Down
4 changes: 1 addition & 3 deletions metagraph/integration_tests/test_assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
import gzip
import itertools
from helpers import get_test_class_name
from base import TestingBase, graph_file_extension, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
from base import PROTEIN_MODE, TestingBase, graph_file_extension, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
from test_query import anno_file_extension, GRAPH_TYPES, ANNO_TYPES, product


"""Test graph assemble"""

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

gfa_tests = {
'compacted': {
'fasta_path': TEST_DATA_DIR + '/transcripts_100.fa',
Expand Down
5 changes: 1 addition & 4 deletions metagraph/integration_tests/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
from tempfile import TemporaryDirectory
import glob
import os
from base import TestingBase, METAGRAPH, TEST_DATA_DIR
from base import PROTEIN_MODE, DNA_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR


"""Test graph construction"""

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")
DNA_MODE = os.readlink(METAGRAPH).endswith("_DNA")

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
Expand Down
4 changes: 1 addition & 3 deletions metagraph/integration_tests/test_build_weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@
import glob
import os
import gzip
from base import TestingBase, METAGRAPH, TEST_DATA_DIR
from base import PROTEIN_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR


"""Test graph construction"""

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
Expand Down
4 changes: 1 addition & 3 deletions metagraph/integration_tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@
import glob
import os
import gzip
from base import TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
from base import PROTEIN_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS


"""Test graph construction"""

PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
Expand Down
5 changes: 1 addition & 4 deletions metagraph/integration_tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@
import os
import numpy as np
from helpers import get_test_class_name
from base import TestingBase, METAGRAPH, TEST_DATA_DIR, graph_file_extension
from base import PROTEIN_MODE, DNA_MODE, TestingBase, METAGRAPH, TEST_DATA_DIR, graph_file_extension
import hashlib


"""Test graph construction"""

DNA_MODE = os.readlink(METAGRAPH).endswith("_DNA")
PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

anno_file_extension = {'column': '.column.annodbg',
'column_coord': '.column_coord.annodbg',
'brwt_coord': '.brwt_coord.annodbg',
Expand Down
2 changes: 0 additions & 2 deletions metagraph/integration_tests/test_transform_anno.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

"""Test operations on annotation columns"""

DNA_MODE = os.readlink(METAGRAPH).endswith("_DNA")
PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")
TEST_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../tests/data'

NUM_THREADS = 4
Expand Down
2 changes: 1 addition & 1 deletion metagraph/src/annotation/annotation_converters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ convert_row_diff_to_BRWT(RowDiffColumnAnnotator &&annotator,
BRWTBottomUpBuilder::Partitioner partitioning,
size_t num_parallel_nodes,
size_t num_threads) {
const graph::DBGSuccinct* graph = annotator.get_matrix().graph();
const graph::DeBruijnGraph* graph = annotator.get_matrix().graph();

auto matrix = std::make_unique<BRWT>(
BRWTBottomUpBuilder::build(std::move(annotator.release_matrix()->diffs().data()),
Expand Down
69 changes: 46 additions & 23 deletions metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,27 @@

namespace mtg {
namespace annot {

using node_index = graph::DeBruijnGraph::node_index;

node_index row_diff_successor(const graph::DeBruijnGraph &graph,
node_index node,
const bit_vector &rd_succ) {
if (auto* dbg_succ = dynamic_cast<graph::DBGSuccinct const*>(&graph)) {
return dbg_succ->get_boss().row_diff_successor(node, rd_succ);
} else {
node_index succ = graph::DeBruijnGraph::npos;
graph.adjacent_outgoing_nodes(node, [&](node_index adjacent_node) {
if (rd_succ[adjacent_node]) {
succ = adjacent_node;
}
});
assert(graph.in_graph(succ) && "a row diff successor must exist");
return succ;
}
}


namespace matrix {

void IRowDiff::load_anchor(const std::string &filename) {
Expand Down Expand Up @@ -41,7 +62,7 @@ void IRowDiff::load_fork_succ(const std::string &filename) {
std::tuple<std::vector<BinaryMatrix::Row>, std::vector<std::vector<size_t>>, std::vector<size_t>>
IRowDiff::get_rd_ids(const std::vector<BinaryMatrix::Row> &row_ids) const {
assert(graph_ && "graph must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->max_index() + 1);

using Row = BinaryMatrix::Row;

Expand All @@ -54,34 +75,36 @@ IRowDiff::get_rd_ids(const std::vector<BinaryMatrix::Row> &row_ids) const {
// been reached before, and thus, will be reconstructed before this one.
std::vector<std::vector<size_t>> rd_paths_trunc(row_ids.size());

const graph::boss::BOSS &boss = graph_->get_boss();
const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last();

for (size_t i = 0; i < row_ids.size(); ++i) {
Row row = row_ids[i];

graph::boss::BOSS::edge_index boss_edge =
graph::AnnotatedSequenceGraph::anno_to_graph_index(row);
node_index node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row);

while (true) {
if (graph_->is_valid(boss_edge)) {
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(boss_edge);

auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size());
rd_paths_trunc[i].push_back(it.value());

// If a node had been reached before, we interrupt the diff path.
// The annotation for that node will have been reconstructed earlier
// than for other nodes in this path as well. Thus, we will start
// reconstruction from that node and don't need its successors.
if (!is_new)
break;

if (anchor_[row])
break;
assert(graph_->in_graph(node));
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node);

auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size());
rd_paths_trunc[i].push_back(it.value());

// If a node had been reached before, we interrupt the diff path.
// The annotation for that node will have been reconstructed earlier
// than for other nodes in this path as well. Thus, we will start
// reconstruction from that node and don't need its successors.
if (!is_new)
break;

if (anchor_[row])
break;

if (fork_succ_.size()) {
node = row_diff_successor(*graph_, node, fork_succ_);
} else { // Only happens in DBGSuccinct
auto *dbg_succ = dynamic_cast<graph::DBGSuccinct const*>(graph_);
assert(dbg_succ);
node = row_diff_successor(*graph_, node, dbg_succ->get_boss().get_last());
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be problematic to, instead of doing this, assign get_last() to fork_succ during its load, if it was detected that it is empty? I assume in this case we would also need to change fork_succ_ itself into a pointer to bit_vector?

}

boss_edge = boss.row_diff_successor(boss_edge, rd_succ);

}
}

Expand Down
20 changes: 12 additions & 8 deletions metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@

namespace mtg {
namespace annot {

graph::DeBruijnGraph::node_index row_diff_successor(const graph::DeBruijnGraph &graph,
graph::DeBruijnGraph::node_index node,
const bit_vector &rd_succ);

namespace matrix {

const std::string kRowDiffAnchorExt = ".anchors";
Expand All @@ -34,8 +39,8 @@ class IRowDiff {

virtual ~IRowDiff() {}

const graph::DBGSuccinct* graph() const { return graph_; }
void set_graph(const graph::DBGSuccinct *graph) { graph_ = graph; }
const graph::DeBruijnGraph* graph() const { return graph_; }
void set_graph(const graph::DeBruijnGraph *graph) { graph_ = graph; }

void load_fork_succ(const std::string &filename);
void load_anchor(const std::string &filename);
Expand All @@ -49,7 +54,7 @@ class IRowDiff {
std::tuple<std::vector<BinaryMatrix::Row>, std::vector<std::vector<size_t>>, std::vector<size_t>>
get_rd_ids(const std::vector<BinaryMatrix::Row> &row_ids) const;

const graph::DBGSuccinct *graph_ = nullptr;
const graph::DeBruijnGraph *graph_ = nullptr;
anchor_bv_type anchor_;
fork_succ_bv_type fork_succ_;
};
Expand Down Expand Up @@ -79,7 +84,7 @@ template <class BaseMatrix>
class RowDiff : public IRowDiff, public BinaryMatrix {
public:
template <typename... Args>
RowDiff(const graph::DBGSuccinct *graph = nullptr, Args&&... args)
RowDiff(const graph::DeBruijnGraph *graph = nullptr, Args&&... args)
: diffs_(std::forward<Args>(args)...) { graph_ = graph; }

/**
Expand Down Expand Up @@ -117,15 +122,14 @@ std::vector<BinaryMatrix::Row> RowDiff<BaseMatrix>::get_column(Column column) co
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");

const graph::boss::BOSS &boss = graph_->get_boss();
assert(!fork_succ_.size() || fork_succ_.size() == boss.get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->max_index() + 1);

std::vector<Row> result;
// TODO: implement a more efficient algorithm
for (Row row = 0; row < num_rows(); ++row) {
auto edge = graph::AnnotatedSequenceGraph::anno_to_graph_index(row);

if (!boss.get_W(edge))
if (!graph_->in_graph(edge))
continue;

SetBitPositions set_bits = get_rows({ row })[0];
Expand All @@ -140,7 +144,7 @@ std::vector<BinaryMatrix::SetBitPositions>
RowDiff<BaseMatrix>::get_rows(const std::vector<Row> &row_ids) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->max_index() + 1);

// get row-diff paths
auto [rd_ids, rd_paths_trunc, times_traversed] = get_rd_ids(row_ids);
Expand Down
Loading