Add support for 2D tiled splits, split hints
This adds two new task hints, `split_1d` and `split_2d`, which influence
the way tasks are split into chunks. The latter uses a new splitting
the way tasks are split into chunks. The latter uses a new splitting
function of the same name; all splitting-related functionality is moved
into a separate file, and tests for both 1D and 2D splitting are
included.
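
For context, user code opts into these hints through the `experimental::hint` API (see the CHANGELOG entry below). A minimal usage sketch (not part of this commit; the queue variable, kernel name, and the assumption that `celerity.h` pulls in `hint.h` are illustrative):

#include <celerity.h>

// Request a 2D tiled split for a device compute task.
void submit_with_2d_split(celerity::distr_queue& q) {
	q.submit([](celerity::handler& cgh) {
		celerity::experimental::hint(cgh, celerity::experimental::hints::split_2d{});
		cgh.parallel_for<class tiled_kernel>(celerity::range<2>{128, 128}, [](celerity::item<2> /* item */) {
			// kernel body
		});
	});
}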
psalz committed Dec 5, 2023
1 parent 2908bc1 commit 404e325
Showing 10 changed files with 550 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ Versioning](http://semver.org/spec/v2.0.0.html).
- `distr_queue::fence` and `buffer_snapshot` are now stable, subsuming the `experimental::` APIs of the same name (#225)
- Celerity now warns at runtime when a task declares reads from uninitialized buffers or writes with overlapping ranges between nodes (#224)
- Introduce new `experimental::hint` API for providing the runtime with additional information on how to execute a task (#227)
- Introduce new `experimental::hints::split_1d` and `experimental::hints::split_2d` task hints for controlling how a task is split into chunks (#227)

### Changed

1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -194,6 +194,7 @@ set(SOURCES
src/recorders.cc
src/runtime.cc
src/scheduler.cc
src/split.cc
src/task.cc
src/task_manager.cc
src/user_bench.cc
29 changes: 28 additions & 1 deletion include/hint.h
@@ -32,4 +32,31 @@ class hint_base {

} // namespace celerity::detail

namespace celerity::experimental::hints {}; // namespace celerity::experimental::hints
namespace celerity::experimental::hints {

/**
* Suggests that the task should be split into 1D chunks.
* This is currently the default behavior.
*/
class split_1d : public detail::hint_base {
private:
void validate(const hint_base& other) const override;
};

/**
* Suggests that the task should be split into 2D chunks.
*/
class split_2d : public detail::hint_base {
private:
void validate(const hint_base& other) const override;
};

inline void split_1d::validate(const hint_base& other) const {
if(dynamic_cast<const split_2d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
}

inline void split_2d::validate(const hint_base& other) const {
if(dynamic_cast<const split_1d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
}

}; // namespace celerity::experimental::hints
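
The mutual exclusion above is implemented via `hint_base::validate`, which each hint overrides to inspect other hints attached to the same task. The same pattern extends to further hints; the following is a hypothetical sketch (the class name is made up and not part of this commit):

#include <stdexcept>

#include "hint.h" // the header extended in this commit

// Hypothetical hint that only makes sense for 1D splits and therefore rejects split_2d.
class prefer_row_slabs : public celerity::detail::hint_base {
  private:
	void validate(const celerity::detail::hint_base& other) const override {
		if(dynamic_cast<const celerity::experimental::hints::split_2d*>(&other) != nullptr) {
			throw std::runtime_error("prefer_row_slabs cannot be combined with split_2d");
		}
	}
};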
12 changes: 12 additions & 0 deletions include/split.h
@@ -0,0 +1,12 @@
#pragma once

#include <vector>

#include "ranges.h"

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);

} // namespace celerity::detail
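
For illustration, the new interface can be exercised directly. A sketch (not part of the commit), assuming `chunk<3>` is constructible from offset, range, and global size as it is used in `split.cc` below:

#include <cassert>

#include "split.h" // ranges.h, included by split.h, provides chunk, range and id

using namespace celerity;
using namespace celerity::detail;

void split_interface_example() {
	// A 128x128 domain, trivial 1x1x1 granularity, four requested chunks.
	const chunk<3> full_chunk{id<3>{0, 0, 0}, range<3>{128, 128, 1}, range<3>{128, 128, 1}};
	const auto slabs = split_1d(full_chunk, range<3>{1, 1, 1}, 4); // four 32x128 row slabs
	const auto tiles = split_2d(full_chunk, range<3>{1, 1, 1}, 4); // four 64x64 tiles, as the new graph generator test checks
	assert(slabs.size() == 4 && tiles.size() == 4);
}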
56 changes: 8 additions & 48 deletions src/distributed_graph_generator.cc
@@ -4,6 +4,7 @@
#include "command.h"
#include "command_graph.h"
#include "recorders.h"
#include "split.h"
#include "task.h"
#include "task_manager.h"

@@ -35,53 +36,6 @@ void distributed_graph_generator::add_buffer(const buffer_id bid, const int dims
m_buffer_states.at(bid).replicated_regions.update_region(subrange<3>({}, range), node_bitset{}.set());
}

// We simply split in the first dimension for now
static std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks, const int dims) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < dims; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const auto actual_num_chunks = std::min(num_chunks, full_chunk.range[0] / granularity[0]);

// If global range is not divisible by (actual_num_chunks * granularity),
// assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
const auto small_chunk_size_dim0 = full_chunk.range[0] / (actual_num_chunks * granularity[0]) * granularity[0];
const auto large_chunk_size_dim0 = small_chunk_size_dim0 + granularity[0];
const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * actual_num_chunks) / granularity[0];
assert(num_large_chunks * large_chunk_size_dim0 + (actual_num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);

std::vector<chunk<3>> result(actual_num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
for(auto i = 0u; i < num_large_chunks; ++i) {
result[i].range[0] = large_chunk_size_dim0;
result[i].offset[0] += i * large_chunk_size_dim0;
}
for(auto i = num_large_chunks; i < actual_num_chunks; ++i) {
result[i].range[0] = small_chunk_size_dim0;
result[i].offset[0] += num_large_chunks * large_chunk_size_dim0 + (i - num_large_chunks) * small_chunk_size_dim0;
}

#ifndef NDEBUG
size_t total_range_dim0 = 0;
for(size_t i = 0; i < result.size(); ++i) {
total_range_dim0 += result[i].range[0];
if(i == 0) {
assert(result[i].offset[0] == full_chunk.offset[0]);
} else {
assert(result[i].offset[0] == result[i - 1].offset[0] + result[i - 1].range[0]);
}
}
assert(total_range_dim0 == full_chunk.range[0]);
#endif

return result;
}

using buffer_requirements_map = std::unordered_map<buffer_id, std::unordered_map<access_mode, region<3>>>;

static buffer_requirements_map get_buffer_requirements_for_mapped_access(const task& tsk, subrange<3> sr, const range<3> global_size) {
@@ -179,7 +133,13 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
}
return chunks;
}
if(tsk.has_variable_split()) { return split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions()); }
if(tsk.has_variable_split()) {
if(tsk.get_hint<experimental::hints::split_1d>() != nullptr) {
// no-op, keeping this for documentation purposes
}
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
}
return std::vector<chunk<3>>{full_chunk};
})();
assert(chunks.size() <= num_chunks); // We may have created less than requested
166 changes: 166 additions & 0 deletions src/split.cc
@@ -0,0 +1,166 @@
#include "split.h"

#include <array>
#include <tuple>

#include "grid.h"

namespace {

using namespace celerity;
using namespace celerity::detail;

[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
region<3> reconstructed_chunk;
for(auto& chnk : split) {
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
}
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
}

template <int Dims>
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
range<Dims> small_chunk_size{zeros};
range<Dims> large_chunk_size{zeros};
range<Dims> num_large_chunks{zeros};
for(int d = 0; d < Dims; ++d) {
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
}
return {small_chunk_size, large_chunk_size, num_large_chunks};
}

/**
* Given a factorization of `num_chunks` (i.e., `f0 * f1 = num_chunks`), try to find the assignment of factors to
* dimensions that produces more chunks under the given constraints. If they are tied, try to find the assignment
* that results in a "nicer" split according to some heuristics (see below).
*
* The single argument `factor` specifies both factors, as `f0 = factor` and `f1 = num_chunks / factor`.
*
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
*/
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
assert(num_chunks % factor == 0);
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
const size_t f0 = factor;
const size_t f1 = num_chunks / factor;

// Decide in which direction to split by first checking which
// factor assignment produces more chunks under the given constraints.
const std::array<size_t, 2> split_0_1 = {std::min(f0, max_chunks[0]), std::min(f1, max_chunks[1])};
const std::array<size_t, 2> split_1_0 = {std::min(f1, max_chunks[0]), std::min(f0, max_chunks[1])};
const auto count0 = split_0_1[0] * split_0_1[1];
const auto count1 = split_1_0[0] * split_1_0[1];

if(count0 > count1) { return split_0_1; }
if(count0 < count1) { return split_1_0; }

// If we're tied for the number of chunks we can create, try some heuristics to decide.

// If domain is square(-ish), prefer splitting along slower dimension.
// (These bounds have been chosen arbitrarily!)
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }

// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
return circ0 < circ1 ? split_0_1 : split_1_0;

// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
// i.e., how balanced the workload is.
}

} // namespace

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
result[i].range[0] = large_chunk_size[0];
result[i].offset[0] += i * large_chunk_size[0];
}
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
result[i].range[0] = small_chunk_size[0];
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
#endif

return result;
}

// TODO: Make the split dimensions configurable for 3D chunks?
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
}
#endif

// Factorize num_chunks
// We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
// and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
// the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
size_t factor = std::floor(std::sqrt(num_chunks));
std::array<size_t, 2> best_chunk_counts = {0, 0};
while(factor >= 1) {
while(factor > 1 && num_chunks % factor != 0) {
factor--;
}
// The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
factor--;
}
const auto actual_num_chunks = best_chunk_counts;
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
id<3> offset = full_chunk.offset;

for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
auto& chnk = result[j * actual_num_chunks[1] + i];
chnk.offset = offset;
chnk.range[0] = chunk_size[0];
chnk.range[1] = chunk_size[1];
offset[1] += chunk_size[1];
}
offset[0] += chunk_size[0];
offset[1] = full_chunk.offset[1];
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
#endif

return result;
}
} // namespace celerity::detail
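
To see how the factorization and heuristics play out on a non-square domain, here is a worked example (a sketch, not part of the commit; the `chunk<3>` construction mirrors its use above):

#include <cassert>

#include "split.h"

using namespace celerity;
using namespace celerity::detail;

void split_2d_worked_example() {
	// 256x128 domain, 1x1x1 granularity, 6 requested chunks:
	// factor = floor(sqrt(6)) = 2 gives the factor pair (2, 3). Both assignments produce 6 chunks
	// and the domain is not square-ish, so the circumference heuristic (256/3 + 128/2 < 256/2 + 128/3)
	// assigns 3 chunks to dimension 0 and 2 chunks to dimension 1.
	const chunk<3> domain{id<3>{0, 0, 0}, range<3>{256, 128, 1}, range<3>{256, 128, 1}};
	const auto chunks = split_2d(domain, range<3>{1, 1, 1}, 6);
	assert(chunks.size() == 6); // two 86x64 chunks (one oversized row block) and four 85x64 chunks
}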
1 change: 1 addition & 0 deletions test/CMakeLists.txt
@@ -42,6 +42,7 @@ set(TEST_TARGETS
runtime_tests
runtime_deprecation_tests
sycl_tests
split_tests
task_graph_tests
task_ring_buffer_tests
test_utils_tests
10 changes: 10 additions & 0 deletions test/graph_gen_granularity_tests.cc
@@ -74,6 +74,16 @@ TEST_CASE("distributed_graph_generator respects split constraints", "[distribute
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_b).get_raw(1)[0])->get_execution_range().range == range<3>{96, 1, 1});
}

TEST_CASE("distributed_graph_generator creates 2-dimensional chunks when providing the split_2d hint", "[distributed_graph_generator][split][task-hints]") {
const size_t num_nodes = 4;
dist_cdag_test_context dctx(num_nodes);
const auto tid_a = dctx.device_compute<class UKN(task)>(range<2>{128, 128}).hint(experimental::hints::split_2d{}).submit();
REQUIRE(dctx.query(tid_a).count() == 4);
for(node_id nid = 0; nid < 4; ++nid) {
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_a).get_raw(nid)[0])->get_execution_range().range == range<3>{64, 64, 1});
}
}

template <int Dims>
class simple_task;

17 changes: 17 additions & 0 deletions test/hint_tests.cc
@@ -55,3 +55,20 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "hints can ensure combinations wit
CHECK_THROWS_WITH(experimental::hint(cgh, my_hint{1336}), "not leet enough");
});
}

TEST_CASE_METHOD(test_utils::runtime_fixture, "split_1d and split_2d hints cannot be combined", "[task-hints]") {
celerity::runtime::init(nullptr, nullptr);
auto& tm = detail::runtime::get_instance().get_task_manager();
SECTION("1d then 2d") {
test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_1d{}));
CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_2d{}), "Cannot combine split_1d and split_2d hints");
});
}
SECTION("2d then 1d") {
test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_2d{}));
CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_1d{}), "Cannot combine split_1d and split_2d hints");
});
}
}