Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lower RAM implementation of slice_columns for BRWT #226

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,7 @@ Requires `M*V/8 + Size(BRWT)` bytes of RAM, where `M` is the number of rows in t
./metagraph assemble -v <GRAPH_DIR>/graph.dbg \
--unitigs \
-a <GRAPH_DIR>/annotation.column.annodbg \
--label-mask-in LABEL_1 \
--label-mask-in LABEL_2 \
--label-mask-out LABEL_3 \
--label-mask-file diff_assembly_experiment_file.txt \
-o diff_assembled.fa
```

Expand Down
94 changes: 94 additions & 0 deletions metagraph/benchmarks/benchmark_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,98 @@ BENCHMARK_TEMPLATE(BM_BRWTQueryRows, 3000000, 100, 30, 2, false, 0)
->Unit(benchmark::kMillisecond)
->DenseRange(0, 10, 1);


// Benchmark: fetch a random subset of columns from a BRWT matrix one at a
// time through get_column(), with the per-column queries spread over three
// OpenMP threads.
//
// Template parameters are forwarded to the data generator and the BRWT
// builder (rows_arg, cols_arg, unique_arg) and (arity_arg, greedy_arg,
// relax_arg) respectively. state.range(0) / 100. is the value handed to
// get_densities() for every unique column.
template <size_t rows_arg = 300000,
          size_t cols_arg = 100,
          size_t unique_arg = 10,
          size_t arity_arg = 2,
          bool greedy_arg = true,
          size_t relax_arg = 2>
static void BM_BRWTQueryColumns(benchmark::State& state) {
    DataGenerator generator;
    // fixed seed so that every run queries the same matrix
    generator.set_seed(42);

    auto density_arg = std::vector<double>(unique_arg, state.range(0) / 100.);
    auto generated_columns = generator.generate_random_columns(
        rows_arg,
        unique_arg,
        get_densities(unique_arg, density_arg),
        std::vector<uint32_t>(unique_arg, cols_arg / unique_arg)
    );

    std::unique_ptr<annot::binmat::BinaryMatrix> matrix = experiments::generate_brwt_from_rows(
        std::move(generated_columns),
        arity_arg,
        greedy_arg,
        relax_arg
    );

    // ids of the columns to query: the set positions of a random bit column
    // of length num_columns() (presumably of density 1/10 - confirm against
    // DataGenerator::generate_random_column)
    std::vector<uint64_t> indexes;
    call_ones(generator.generate_random_column(matrix->num_columns(), 1. / 10),
        [&](uint64_t i) { indexes.push_back(i); }
    );

    for (auto _ : state) {
        uint64_t j = 0;
        // j is accumulated by all threads; the reduction clause gives each
        // thread a private copy and sums them at the end, which removes the
        // data race of a plain shared "j += ..."
        #pragma omp parallel for num_threads(3) reduction(+:j)
        for (size_t i = 0; i < indexes.size(); ++i) {
            j += i;
            for (auto t : matrix->get_column(indexes[i])) {
                j += t;
            }
        }
        // keep the checksum alive so that the queries are not optimized away
        benchmark::DoNotOptimize(j);
    }
}

// Registers BM_BRWTQueryColumns with rows_arg=3000000, cols_arg=100,
// unique_arg=30, arity_arg=2, greedy_arg=false, relax_arg=0.
// state.range(0) runs over 0..10 in steps of 1 (divided by 100 inside the
// benchmark); timings are reported in milliseconds.
BENCHMARK_TEMPLATE(BM_BRWTQueryColumns, 3000000, 100, 30, 2, false, 0)
->Unit(benchmark::kMillisecond)
->DenseRange(0, 10, 1);

// Benchmark: fetch a random subset of columns from a BRWT matrix with a
// single slice_columns() call. The call is issued by one thread of a
// three-thread OpenMP team, so that OpenMP tasks created inside
// slice_columns() can be picked up by the other threads of the team.
//
// Template parameters are forwarded to the data generator and the BRWT
// builder (rows_arg, cols_arg, unique_arg) and (arity_arg, greedy_arg,
// relax_arg) respectively. state.range(0) / 100. is the value handed to
// get_densities() for every unique column.
template <size_t rows_arg = 300000,
          size_t cols_arg = 100,
          size_t unique_arg = 10,
          size_t arity_arg = 2,
          bool greedy_arg = true,
          size_t relax_arg = 2>
static void BM_BRWTSliceColumns(benchmark::State& state) {
    DataGenerator generator;
    // fixed seed so that every run queries the same matrix
    generator.set_seed(42);

    auto density_arg = std::vector<double>(unique_arg, state.range(0) / 100.);
    auto generated_columns = generator.generate_random_columns(
        rows_arg,
        unique_arg,
        get_densities(unique_arg, density_arg),
        std::vector<uint32_t>(unique_arg, cols_arg / unique_arg)
    );

    std::unique_ptr<annot::binmat::BinaryMatrix> matrix = experiments::generate_brwt_from_rows(
        std::move(generated_columns),
        arity_arg,
        greedy_arg,
        relax_arg
    );

    // ids of the columns to query: the set positions of a random bit column
    // of length num_columns() (presumably of density 1/10 - confirm against
    // DataGenerator::generate_random_column)
    std::vector<uint64_t> indexes;
    call_ones(generator.generate_random_column(matrix->num_columns(), 1. / 10),
        [&](uint64_t i) { indexes.push_back(i); }
    );

    for (auto _ : state) {
        uint64_t j = 0;
        #pragma omp parallel num_threads(3)
        #pragma omp single
        {
            matrix->slice_columns(indexes, [&](auto i, auto&& bitmap) {
                // The callback may be invoked from several OpenMP tasks at
                // once, so sum into a local first and publish it with a
                // single atomic update instead of racing on j.
                uint64_t column_sum = i;
                bitmap.call_ones([&](auto t) { column_sum += t; });
                #pragma omp atomic
                j += column_sum;
            });
        }
        // keep the checksum alive so that the queries are not optimized away
        benchmark::DoNotOptimize(j);
    }
}

// Registers BM_BRWTSliceColumns with rows_arg=3000000, cols_arg=100,
// unique_arg=30, arity_arg=2, greedy_arg=false, relax_arg=0.
// state.range(0) runs over 0..10 in steps of 1 (divided by 100 inside the
// benchmark); timings are reported in milliseconds.
BENCHMARK_TEMPLATE(BM_BRWTSliceColumns, 3000000, 100, 30, 2, false, 0)
->Unit(benchmark::kMillisecond)
->DenseRange(0, 10, 1);

} // namespace
11 changes: 11 additions & 0 deletions metagraph/src/annotation/binary_matrix/base/binary_matrix.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "binary_matrix.hpp"

#include "common/vectors/bitmap.hpp"
#include "common/vectors/bit_vector_adaptive.hpp"
#include "common/serialization.hpp"


Expand Down Expand Up @@ -32,6 +34,15 @@ BinaryMatrix::slice_rows(const std::vector<Row> &row_ids) const {
return slice;
}

void BinaryMatrix::slice_columns(const std::vector<Column> &column_ids,
const ColumnCallback &callback) const {
size_t nrows = num_rows();
for (size_t k = 0; k < column_ids.size(); ++k) {
Column j = column_ids[k];
callback(j, bitmap_generator(get_column(j), nrows));
}
}

template <typename RowType>
StreamRows<RowType>::StreamRows(const std::string &filename, size_t offset) {
std::ifstream instream(filename, std::ios::binary);
Expand Down
7 changes: 7 additions & 0 deletions metagraph/src/annotation/binary_matrix/base/binary_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "common/vector.hpp"


class bitmap;

namespace mtg {
namespace annot {
namespace binmat {
Expand All @@ -21,6 +23,7 @@ class BinaryMatrix {
typedef Vector<Column> SetBitPositions;
typedef std::function<void(const SetBitPositions &)> RowCallback;
typedef std::function<void(Row, Column)> ValueCallback;
typedef std::function<void(Column, bitmap&&)> ColumnCallback;

virtual ~BinaryMatrix() {}

Expand All @@ -32,9 +35,13 @@ class BinaryMatrix {
virtual SetBitPositions get_row(Row row) const = 0;
virtual std::vector<SetBitPositions> get_rows(const std::vector<Row> &rows) const;
virtual std::vector<Row> get_column(Column column) const = 0;

// get all selected rows appended with -1 and concatenated
virtual std::vector<Column> slice_rows(const std::vector<Row> &rows) const;

// Invokes callback once for every id in columns, passing the column id and
// a bitmap for that column. The base implementation builds the bitmap from
// get_column(); subclasses may override it.
virtual void slice_columns(const std::vector<Column> &columns,
const ColumnCallback &callback) const;

virtual bool load(std::istream &in) = 0;
virtual void serialize(std::ostream &out) const = 0;

Expand Down
97 changes: 97 additions & 0 deletions metagraph/src/annotation/binary_matrix/multi_brwt/brwt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
#include <queue>
#include <numeric>

#include <omp.h>

#include <tsl/hopscotch_map.h>

#include "common/algorithms.hpp"
#include "common/serialization.hpp"

Expand Down Expand Up @@ -189,6 +193,99 @@ std::vector<BRWT::Column> BRWT::slice_rows(const std::vector<Row> &row_ids) cons
return slice;
}

// Reports the requested columns of this BRWT node through callback.
// Each invocation receives a column id in the numbering of this node
// (child ids are translated back with assignments_.get) and a bitmap for
// that column.
// Requests are grouped by child node; all groups except the first are
// processed in OpenMP tasks, so callback may be invoked from several threads
// when this function runs inside an OpenMP parallel region.
void BRWT::slice_columns(const std::vector<Column> &column_ids,
const ColumnCallback &callback) const {
Comment on lines +196 to +197
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call_columns

// nothing was requested
if (column_ids.empty())
return;

auto num_nonzero_rows = nonzero_rows_->num_set_bits();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto num_nonzero_rows = nonzero_rows_->num_set_bits();
uint64_t num_nonzero_rows = nonzero_rows_->num_set_bits();


// check if the column is empty
// NOTE(review): this returns without invoking callback for any of the
// requested columns, so callers never see the empty columns.
if (!num_nonzero_rows)
return;
Comment on lines +203 to +205
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if they are empty, you still need to call them. Add unit tests?


// check whether it is a leaf
if (!child_nodes_.size()) {
// return the index column
// (every requested id receives its own copy of nonzero_rows_)
for (size_t k = 0; k < column_ids.size(); ++k) {
callback(column_ids[k], std::move(*nonzero_rows_->copy()));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better call a const reference, so the column can be copied by the caller if it's needed, and otherwise, there is no overhead.

Suggested change
callback(column_ids[k], std::move(*nonzero_rows_->copy()));
callback(column_ids[k], *nonzero_rows_);

Comment on lines +210 to +211
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not range-based loop?
for (size_t col_id : column_ids) {
...
}

}

return;
}

// Group the requested columns by the child node that stores them:
// assignments_.group() gives the child node, assignments_.rank() the column
// index that is passed on to that child.
tsl::hopscotch_map<uint32_t, std::vector<Column>> child_columns_map;
for (size_t i = 0; i < column_ids.size(); ++i) {
assert(column_ids[i] < num_columns());
auto child_node = assignments_.group(column_ids[i]);
auto child_column = assignments_.rank(column_ids[i]);

auto it = child_columns_map.find(child_node);
if (it == child_columns_map.end())
it = child_columns_map.emplace(child_node, std::vector<Column>{}).first;

it.value().push_back(child_column);
}

// Handles all requested columns that belong to a single child node.
auto process = [&](auto child_node, auto *child_columns_ptr) {
// All rows of this node are nonzero: the bitmaps produced by the child are
// forwarded unchanged, only the column id is translated.
if (num_nonzero_rows == nonzero_rows_->size()) {
child_nodes_[child_node]->slice_columns(*child_columns_ptr,
[&](Column j, bitmap&& rows) {
callback(assignments_.get(child_node, j), std::move(rows));
}
);
} else {
const BRWT *child_node_brwt = dynamic_cast<const BRWT*>(
child_nodes_[child_node].get()
);
if (child_node_brwt
&& child_columns_ptr->size() > 1
&& !child_node_brwt->child_nodes_.size()) {
// if there are multiple column ids corresponding to the same leaf
// node, then this branch avoids doing redundant select1 calls
const auto *nonzero_rows = child_node_brwt->nonzero_rows_.get();
size_t num_nonzero_rows = nonzero_rows->num_set_bits();
// NOTE(review): when the child leaf has no set bits, callback is not
// invoked for any of the columns in this group.
if (num_nonzero_rows) {
std::vector<uint64_t> set_bits;
set_bits.reserve(num_nonzero_rows);
// NOTE(review): select1 is called on the nonzero_rows vector of the child
// here, whereas the general branch below maps child indexes through
// nonzero_rows_ of this node - confirm which one is intended.
nonzero_rows->call_ones([&](auto i) {
set_bits.push_back(nonzero_rows->select1(i + 1));
});

// NOTE(review): set_bits goes through std::move in every iteration and once
// more after the loop; if bitmap_generator takes ownership of the vector,
// every call after the first one receives a moved-from vector - confirm.
for (size_t k = 0; k < child_columns_ptr->size() - 1; ++k) {
callback(assignments_.get(child_node, (*child_columns_ptr)[k]),
bitmap_generator(std::move(set_bits), num_rows()));
}

callback(assignments_.get(child_node, child_columns_ptr->back()),
bitmap_generator(std::move(set_bits), num_rows()));
}
} else {
Comment on lines +217 to +263
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add some comments to explain why this is going to make things faster than the basic call?

// General case: recurse into the child and wrap each returned bitmap so that
// a child row index i is mapped to nonzero_rows_->select1(i + 1) of this node.
child_nodes_[child_node]->slice_columns(*child_columns_ptr,
[&](Column j, bitmap&& rows) {
size_t num_set_bits = rows.num_set_bits();
callback(assignments_.get(child_node, j),
bitmap_generator(std::move(rows), [&](uint64_t i) {
return nonzero_rows_->select1(i + 1);
}, num_rows(), num_set_bits));
}
);
}
}
};

// Every group except the first one is spawned as an OpenMP task; the key and
// a pointer to the column list are copied into the task (firstprivate).
for (auto it = ++child_columns_map.begin(); it != child_columns_map.end(); ++it) {
auto child_node = it->first;
auto *child_columns_ptr = &it->second;
#pragma omp task firstprivate(child_node, child_columns_ptr)
process(child_node, child_columns_ptr);
}

// the first group is handled by the current thread
process(child_columns_map.begin()->first, &child_columns_map.begin()->second);

// wait for the spawned tasks: they reference child_columns_map and the
// process lambda, which are local to this call
#pragma omp taskwait
}

std::vector<BRWT::Row> BRWT::get_column(Column column) const {
assert(column < num_columns());

Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/annotation/binary_matrix/multi_brwt/brwt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ class BRWT : public BinaryMatrix {
// get all selected rows appended with -1 and concatenated
std::vector<Column> slice_rows(const std::vector<Row> &rows) const override;

// BRWT override of BinaryMatrix::slice_columns: invokes callback with a
// column id and a bitmap for the requested columns.
void slice_columns(const std::vector<Column> &columns,
const ColumnCallback &callback) const override;

bool load(std::istream &in) override;
void serialize(std::ostream &out) const override;

Expand Down
19 changes: 19 additions & 0 deletions metagraph/src/annotation/binary_matrix/rainbowfish/rainbow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,25 @@ Rainbow<MatrixType>::get_column(Column column) const {
return row_indices;
}

// Slices columns of the Rainbow matrix: the requested columns are first
// sliced from reduced_matrix_, then each reduced column is expanded to all
// nrows rows - row i is reported when the bit at position get_code(i) is
// set in the reduced column.
// NOTE(review): code_column is a single buffer that is cleared and refilled
// for every column, and the generator lambda captures it by reference. This
// presumes that reduced_matrix_.slice_columns invokes its callback
// sequentially and that the bitmap is consumed before the next column is
// processed - confirm.
template <class MatrixType>
void
Rainbow<MatrixType>::slice_columns(const std::vector<Column> &columns,
const ColumnCallback &callback) const {
uint64_t nrows = num_rows();
// scratch bit vector with one bit per row of the reduced matrix
sdsl::bit_vector code_column(reduced_matrix_.num_rows());
reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& rows) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& rows) {
reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& reduced_column) {

// reset the scratch vector, then mark the rows set in the reduced column
sdsl::util::set_to_value(code_column, false);
rows.add_to(&code_column);

// the generator scans all rows and reports those whose code is marked
callback(j, bitmap_generator([&](const auto &index_callback) {
for (uint64_t i = 0; i < nrows; ++i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will take forever. Make it parallel

Suggested change
for (uint64_t i = 0; i < nrows; ++i) {
// NOTE(review): the pragma suggested below lacks the omp keyword and the
// for construct; with a parallel loop index_callback would also be invoked
// concurrently - confirm that this is safe before applying it.
#pragma parallel num_threads(get_num_threads())
for (uint64_t i = 0; i < nrows; ++i) {

if (code_column[get_code(i)])
index_callback(i);
}
}, nrows));
});
}

template <class MatrixType>
bool Rainbow<MatrixType>::load(std::istream &in) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ class Rainbow : public RainbowMatrix {
size_t num_threads = 1) const override;
std::vector<Row> get_column(Column column) const override;

// Rainbow override of column slicing: invokes callback with a column id and
// a bitmap for the requested columns.
void slice_columns(const std::vector<Column> &columns,
const ColumnCallback &callback) const override;
Comment on lines +43 to +44
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to call_columns


bool load(std::istream &in) override;
void serialize(std::ostream &out) const override;

Expand Down
Loading