From cf3c531e012c9c0ddc87485cab0ae3659ab976ae Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:18:37 -0800
Subject: [PATCH 1/3] Host Ir: add linear op with preallocated outputs

---
 csrc/host_ir/executor.cpp   | 55 +++++++++++++++++++++++++
 csrc/host_ir/executor.h     |  1 +
 csrc/ir/internal_nodes.h    |  1 -
 tests/cpp/test_host_irs.cpp | 82 +++++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 0f9f3da6921..102ae5feb3e 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -545,6 +545,61 @@ void HostIrEvaluator::handle(MatmulOp* matmul) {
   }
 }
 
+void HostIrEvaluator::handle(LinearOp* linear) {
+  TensorView* in = linear->inA()->as<TensorView>();
+  TensorView* weight = linear->inB()->as<TensorView>();
+  TensorView* bias = linear->bias()->as<TensorView>();
+  TensorView* out = linear->out()->as<TensorView>();
+  NVF_ERROR(
+      expr_evaluator_.isKnown(in)
+      && expr_evaluator_.isKnown(weight)
+      && (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
+      "Inputs of the Linear Op ",
+      linear->toString(),
+      " must be precomputed before being retrieved");
+
+  if (!expr_evaluator_.isKnown(out)) {
+    unhandled(linear);
+    return;
+  }
+
+  auto squeeze_device_dims = [](at::Tensor& t,
+                                int64_t num_device_dims) -> void {
+    // Record the initial shape for the error message.
+    std::vector<int64_t> shape = t.sizes().vec();
+    for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
+      NVF_CHECK(
+          t.size(0) == 1,
+          "When the weight is >2D, expect its preceding dimensions and "
+          "the bias's preceding dimensions to "
+          "be DID-parallel and therefore size-1: ",
+          shape);
+      t = t.squeeze(0);
+    }
+  };
+
+  auto in_at = expr_evaluator_.evaluate(in).as<at::Tensor>();
+  auto weight_at = expr_evaluator_.evaluate(weight).as<at::Tensor>();
+  auto bias_at = expr_evaluator_.evaluate(bias).as<at::Tensor>();
+  auto out_at = expr_evaluator_.evaluate(out).as<at::Tensor>();
+
+  // The squeezes and unsqueezes are currently required to support a sharded
+  // linear layer. Remove them after #2563.
+  auto num_device_dims = weight_at.dim() - 2;
+  squeeze_device_dims(weight_at, num_device_dims);
+  if (linear->has_bias()) {
+    squeeze_device_dims(bias_at, num_device_dims);
+    at::linear_out(out_at, in_at, weight_at, bias_at);
+  } else {
+    at::linear_out(out_at, in_at, weight_at);
+  }
+
+  for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
+    out_at = out_at.unsqueeze(0);
+  }
+  expr_evaluator_.bind(out, out_at, /*evaluate_validate=*/false);
+}
+
 void HostIrEvaluator::handle(kir::Allocate* allocate) {
   NVF_ERROR(
       allocate->buffer()->isA<TensorView>(),
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 2797948975a..ad3e8422ca1 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -127,6 +127,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   void handle(EndCoalescing* end_coalescing) override;
   void handle(kir::IfThenElse* if_then_else) override;
   void handle(MatmulOp* matmul) override;
+  void handle(LinearOp* linear) override;
   void handle(kir::Allocate* allocate) override;
   void unhandled(Statement* stmt) override;
 
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 6aebcb3c457..ff2e29b04af 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2269,7 +2269,6 @@ class LinearOp : public Expr {
       const ExpressionEvaluator& ee,
       const std::vector<PolymorphicValue>& inputs) const override;
 
- private:
   bool has_bias() const {
     return inputs().size() == 3;
   }
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index e97550309e1..54832acc8ac 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -849,6 +849,88 @@ TEST_F(MatmulHostIrTest, HostIrMatmulOut) {
   EXPECT_TRUE(ref_output.allclose(c_tensor));
 }
 
+using LinearHostIrTest = NVFuserTest;
+
+TEST_F(LinearHostIrTest, HostIr) {
+  constexpr int64_t B = 32;
+  constexpr int64_t M = 64;
+  constexpr int64_t K = 128;
+  constexpr int64_t N = 256;
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  TensorView* in = makeContigTensor(3);
+  TensorView* weight = makeContigTensor(2);
+  TensorView* bias = makeContigTensor(1);
+  TensorView* out = linear(in, weight, bias);
+
+  hic->addInput(in);
+  hic->addInput(weight);
+  hic->addInput(bias);
+  hic->addOutput(out);
+
+  hic->pushBackTopLevelExprs(out->definition());
+
+  HostIrEvaluator hie(std::move(hic));
+
+  auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
+  at::Tensor in_at = at::randn({B, M, K}, options);
+  at::Tensor weight_at = at::randn({N, K}, options);
+  at::Tensor bias_at = at::randn({N}, options);
+  std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
+      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}};
+
+  auto output = hie.runWithInput(concrete_input_buffers).at(0);
+
+  // validate
+  auto ref_output = at::linear(in_at, weight_at, bias_at);
+
+  EXPECT_TRUE(ref_output.allclose(output));
+}
+
+TEST_F(LinearHostIrTest, HostIrLinearOut) {
+  constexpr int64_t B = 32;
+  constexpr int64_t M = 64;
+  constexpr int64_t K = 128;
+  constexpr int64_t N = 256;
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  TensorView* in = makeContigTensor(3);
+  TensorView* weight = makeContigTensor(2);
+  TensorView* bias = makeContigTensor(1);
+  TensorView* out = makeContigTensor(3);
+
+  auto linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias);
+
+  hic->addInput(in);
+  hic->addInput(weight);
+  hic->addInput(bias);
+  hic->addInput(out);
+  hic->addOutput(out);
+
+  hic->pushBackTopLevelExprs(linear_op);
+
+  HostIrEvaluator hie(std::move(hic));
+
+  auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
+  at::Tensor in_at = at::randn({B, M, K}, options);
+  at::Tensor weight_at = at::randn({N, K}, options);
+  at::Tensor bias_at = at::randn({N}, options);
+  at::Tensor out_at = at::empty({B, M, N}, options);
+  std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
+      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}, {hie.inputs().at(3), out_at}};
+
+  hie.runWithInput(concrete_input_buffers);
+
+  // validate
+  auto ref_output = at::linear(in_at, weight_at, bias_at);
+
+  EXPECT_TRUE(ref_output.allclose(out_at));
+}
+
 using SelectHostIrTestParams = bool;
 using SelectHostIrTest = NVFuserFixtureParamTest<SelectHostIrTestParams>;

From 9e179334edf213686718fe3db8040804ee642e46 Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:23:22 -0800
Subject: [PATCH 2/3] slightly simplify implementation and test

---
 csrc/host_ir/executor.cpp   | 29 ++---------------------------
 tests/cpp/test_host_irs.cpp |  1 -
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 102ae5feb3e..f943ccc6927 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -563,41 +563,16 @@ void HostIrEvaluator::handle(LinearOp* linear) {
     return;
   }
 
-  auto squeeze_device_dims = [](at::Tensor& t,
-                                int64_t num_device_dims) -> void {
-    // Record the initial shape for the error message.
-    std::vector<int64_t> shape = t.sizes().vec();
-    for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
-      NVF_CHECK(
-          t.size(0) == 1,
-          "When the weight is >2D, expect its preceding dimensions and "
-          "the bias's preceding dimensions to "
-          "be DID-parallel and therefore size-1: ",
-          shape);
-      t = t.squeeze(0);
-    }
-  };
-
   auto in_at = expr_evaluator_.evaluate(in).as<at::Tensor>();
   auto weight_at = expr_evaluator_.evaluate(weight).as<at::Tensor>();
   auto bias_at = expr_evaluator_.evaluate(bias).as<at::Tensor>();
   auto out_at = expr_evaluator_.evaluate(out).as<at::Tensor>();
 
-  // The squeezes and unsqueezes are currently required to support a sharded
-  // linear layer. Remove them after #2563.
-  auto num_device_dims = weight_at.dim() - 2;
-  squeeze_device_dims(weight_at, num_device_dims);
   if (linear->has_bias()) {
-    squeeze_device_dims(bias_at, num_device_dims);
-    at::linear_out(out_at, in_at, weight_at, bias_at);
+    at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze());
   } else {
-    at::linear_out(out_at, in_at, weight_at);
-  }
-
-  for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
-    out_at = out_at.unsqueeze(0);
+    at::linear_out(out_at, in_at, weight_at.squeeze());
   }
-  expr_evaluator_.bind(out, out_at, /*evaluate_validate=*/false);
 }
 
 void HostIrEvaluator::handle(kir::Allocate* allocate) {
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index 54832acc8ac..687072e172c 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -909,7 +909,6 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   hic->addInput(weight);
   hic->addInput(bias);
   hic->addInput(out);
-  hic->addOutput(out);
 
   hic->pushBackTopLevelExprs(linear_op);
 

From d4326626caf57ff9b0f65d1bb4d433fcc982381f Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:24:06 -0800
Subject: [PATCH 3/3] lint

---
 csrc/host_ir/executor.cpp   | 5 ++---
 tests/cpp/test_host_irs.cpp | 9 +++++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index f943ccc6927..170f4fcc6dc 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -551,9 +551,8 @@ void HostIrEvaluator::handle(LinearOp* linear) {
   TensorView* bias = linear->bias()->as<TensorView>();
   TensorView* out = linear->out()->as<TensorView>();
   NVF_ERROR(
-      expr_evaluator_.isKnown(in)
-      && expr_evaluator_.isKnown(weight)
-      && (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
+      expr_evaluator_.isKnown(in) && expr_evaluator_.isKnown(weight) &&
+      (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
       "Inputs of the Linear Op ",
       linear->toString(),
       " must be precomputed before being retrieved");
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index 687072e172c..e0f41c70a91 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -879,7 +879,9 @@ TEST_F(LinearHostIrTest, HostIr) {
   at::Tensor weight_at = at::randn({N, K}, options);
   at::Tensor bias_at = at::randn({N}, options);
   std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
-      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}};
+      {hie.inputs().at(0), in_at},
+      {hie.inputs().at(1), weight_at},
+      {hie.inputs().at(2), bias_at}};
 
   auto output = hie.runWithInput(concrete_input_buffers).at(0);
 
@@ -920,7 +922,10 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   at::Tensor bias_at = at::randn({N}, options);
   at::Tensor out_at = at::empty({B, M, N}, options);
   std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
-      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}, {hie.inputs().at(3), out_at}};
+      {hie.inputs().at(0), in_at},
+      {hie.inputs().at(1), weight_at},
+      {hie.inputs().at(2), bias_at},
+      {hie.inputs().at(3), out_at}};
 
   hie.runWithInput(concrete_input_buffers);
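
---

Note (not part of the patches): a minimal standalone sketch of the ATen call
this series builds on. at::linear_out computes out = in @ weight^T + bias into
a caller-provided buffer instead of allocating a fresh tensor, which is what
lets HostIrEvaluator::handle(LinearOp*) honor a preallocated output. The CPU
tensors and the main() wrapper below are illustrative assumptions for a
standalone build, not code from the PR.

    // Sketch only: mirrors the shapes used in LinearHostIrTest.
    #include <ATen/ATen.h>

    int main() {
      at::Tensor in = at::randn({32, 64, 128});   // [B, M, K]
      at::Tensor weight = at::randn({256, 128});  // [N, K]
      at::Tensor bias = at::randn({256});         // [N]

      // Preallocated output buffer, as in the HostIrLinearOut test.
      at::Tensor out = at::empty({32, 64, 256});  // [B, M, N]

      // Writes the result into `out` in place; no new allocation.
      at::linear_out(out, in, weight, bias);

      // Matches the allocating form at::linear.
      TORCH_CHECK(out.allclose(at::linear(in, weight, bias)));
      return 0;
    }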