From cf3c531e012c9c0ddc87485cab0ae3659ab976ae Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:18:37 -0800
Subject: [PATCH 1/3] Host Ir: add linear op with preallocated outputs

---
 csrc/host_ir/executor.cpp   | 55 +++++++++++++++++++++++++
 csrc/host_ir/executor.h     |  1 +
 csrc/ir/internal_nodes.h    |  1 -
 tests/cpp/test_host_irs.cpp | 82 +++++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 0f9f3da6921..102ae5feb3e 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -545,6 +545,61 @@ void HostIrEvaluator::handle(MatmulOp* matmul) {
   }
 }
 
+void HostIrEvaluator::handle(LinearOp* linear) {
+  TensorView* in = linear->inA()->as<TensorView>();
+  TensorView* weight = linear->inB()->as<TensorView>();
+  TensorView* bias = linear->bias()->as<TensorView>();
+  TensorView* out = linear->out()->as<TensorView>();
+  NVF_ERROR(
+      expr_evaluator_.isKnown(in)
+      && expr_evaluator_.isKnown(weight)
+      && (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
+      "Inputs of the Linear Op ",
+      linear->toString(),
+      " must be precomputed before being retrieved");
+
+  if (!expr_evaluator_.isKnown(out)) {
+    unhandled(linear);
+    return;
+  }
+
+  auto squeeze_device_dims = [](at::Tensor& t,
+                                int64_t num_device_dims) -> void {
+    // Record the initial shape for the error message.
+    std::vector<int64_t> shape = t.sizes().vec();
+    for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
+      NVF_CHECK(
+          t.size(0) == 1,
+          "When the weight is >2D, expect its preceding dimensions and "
+          "the bias's preceding dimensions to "
+          "be DID-parallel and therefore size-1: ",
+          shape);
+      t = t.squeeze(0);
+    }
+  };
+
+  auto in_at = expr_evaluator_.evaluate(in).as<at::Tensor>();
+  auto weight_at = expr_evaluator_.evaluate(weight).as<at::Tensor>();
+  auto bias_at = expr_evaluator_.evaluate(bias).as<at::Tensor>();
+  auto out_at = expr_evaluator_.evaluate(out).as<at::Tensor>();
+
+  // The squeezes and unsqueezes are currently required to support a sharded
+  // linear layer. Remove them after #2563.
+  auto num_device_dims = weight_at.dim() - 2;
+  squeeze_device_dims(weight_at, num_device_dims);
+  if (linear->has_bias()) {
+    squeeze_device_dims(bias_at, num_device_dims);
+    at::linear_out(out_at, in_at, weight_at, bias_at);
+  } else {
+    at::linear_out(out_at, in_at, weight_at);
+  }
+
+  for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
+    out_at = out_at.unsqueeze(0);
+  }
+  expr_evaluator_.bind(out, out_at, /*evaluate_validate=*/false);
+}
+
 void HostIrEvaluator::handle(kir::Allocate* allocate) {
   NVF_ERROR(
       allocate->buffer()->isA<TensorView>(),
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 2797948975a..ad3e8422ca1 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -127,6 +127,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   void handle(EndCoalescing* end_coalescing) override;
   void handle(kir::IfThenElse* if_then_else) override;
   void handle(MatmulOp* matmul) override;
+  void handle(LinearOp* linear) override;
   void handle(kir::Allocate* allocate) override;
   void unhandled(Statement* stmt) override;
 
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 6aebcb3c457..ff2e29b04af 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2269,7 +2269,6 @@ class LinearOp : public Expr {
       const ExpressionEvaluator& ee,
       const std::vector<PolymorphicValue>& inputs) const override;
 
- private:
   bool has_bias() const {
     return inputs().size() == 3;
   }
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index e97550309e1..54832acc8ac 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -849,6 +849,88 @@ TEST_F(MatmulHostIrTest, HostIrMatmulOut) {
   EXPECT_TRUE(ref_output.allclose(c_tensor));
 }
 
+using LinearHostIrTest = NVFuserTest;
+
+TEST_F(LinearHostIrTest, HostIr) {
+  constexpr int64_t B = 32;
+  constexpr int64_t M = 64;
+  constexpr int64_t K = 128;
+  constexpr int64_t N = 256;
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  TensorView* in = makeContigTensor(3);
+  TensorView* weight = makeContigTensor(2);
+  TensorView* bias = makeContigTensor(1);
+  TensorView* out = linear(in, weight, bias);
+
+  hic->addInput(in);
+  hic->addInput(weight);
+  hic->addInput(bias);
+  hic->addOutput(out);
+
+  hic->pushBackTopLevelExprs(out->definition());
+
+  HostIrEvaluator hie(std::move(hic));
+
+  auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
+  at::Tensor in_at = at::randn({B, M, K}, options);
+  at::Tensor weight_at = at::randn({N, K}, options);
+  at::Tensor bias_at = at::randn({N}, options);
+  std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
+      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}};
+
+  auto output = hie.runWithInput(concrete_input_buffers).at(0);
+
+  // validate
+  auto ref_output = at::linear(in_at, weight_at, bias_at);
+
+  EXPECT_TRUE(ref_output.allclose(output));
+}
+
+TEST_F(LinearHostIrTest, HostIrLinearOut) {
+  constexpr int64_t B = 32;
+  constexpr int64_t M = 64;
+  constexpr int64_t K = 128;
+  constexpr int64_t N = 256;
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  TensorView* in = makeContigTensor(3);
+  TensorView* weight = makeContigTensor(2);
+  TensorView* bias = makeContigTensor(1);
+  TensorView* out = makeContigTensor(3);
+
+  auto linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias);
+
+  hic->addInput(in);
+  hic->addInput(weight);
+  hic->addInput(bias);
+  hic->addInput(out);
+  hic->addOutput(out);
+
+  hic->pushBackTopLevelExprs(linear_op);
+
+  HostIrEvaluator hie(std::move(hic));
+
+  auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
+  at::Tensor in_at = at::randn({B, M, K}, options);
+  at::Tensor weight_at = at::randn({N, K}, options);
+  at::Tensor bias_at = at::randn({N}, options);
+  at::Tensor out_at = at::empty({B, M, N}, options);
+  std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
+      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}, {hie.inputs().at(3), out_at}};
+
+  hie.runWithInput(concrete_input_buffers);
+
+  // validate
+  auto ref_output = at::linear(in_at, weight_at, bias_at);
+
+  EXPECT_TRUE(ref_output.allclose(out_at));
+}
+
 using SelectHostIrTestParams = bool;
 using SelectHostIrTest = NVFuserFixtureParamTest<SelectHostIrTestParams>;

From 9e179334edf213686718fe3db8040804ee642e46 Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:23:22 -0800
Subject: [PATCH 2/3] slightly simplify implementation and test

---
 csrc/host_ir/executor.cpp   | 29 ++---------------------------
 tests/cpp/test_host_irs.cpp |  1 -
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 102ae5feb3e..f943ccc6927 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -563,41 +563,16 @@ void HostIrEvaluator::handle(LinearOp* linear) {
     return;
   }
 
-  auto squeeze_device_dims = [](at::Tensor& t,
-                                int64_t num_device_dims) -> void {
-    // Record the initial shape for the error message.
-    std::vector<int64_t> shape = t.sizes().vec();
-    for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
-      NVF_CHECK(
-          t.size(0) == 1,
-          "When the weight is >2D, expect its preceding dimensions and "
-          "the bias's preceding dimensions to "
-          "be DID-parallel and therefore size-1: ",
-          shape);
-      t = t.squeeze(0);
-    }
-  };
-
   auto in_at = expr_evaluator_.evaluate(in).as<at::Tensor>();
   auto weight_at = expr_evaluator_.evaluate(weight).as<at::Tensor>();
   auto bias_at = expr_evaluator_.evaluate(bias).as<at::Tensor>();
   auto out_at = expr_evaluator_.evaluate(out).as<at::Tensor>();
 
-  // The squeezes and unsqueezes are currently required to support a sharded
-  // linear layer. Remove them after #2563.
-  auto num_device_dims = weight_at.dim() - 2;
-  squeeze_device_dims(weight_at, num_device_dims);
   if (linear->has_bias()) {
-    squeeze_device_dims(bias_at, num_device_dims);
-    at::linear_out(out_at, in_at, weight_at, bias_at);
+    at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze());
   } else {
-    at::linear_out(out_at, in_at, weight_at);
-  }
-
-  for ([[maybe_unused]] auto _ : c10::irange(num_device_dims)) {
-    out_at = out_at.unsqueeze(0);
+    at::linear_out(out_at, in_at, weight_at.squeeze());
   }
-  expr_evaluator_.bind(out, out_at, /*evaluate_validate=*/false);
 }
 
 void HostIrEvaluator::handle(kir::Allocate* allocate) {
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index 54832acc8ac..687072e172c 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -909,7 +909,6 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   hic->addInput(weight);
   hic->addInput(bias);
   hic->addInput(out);
-  hic->addOutput(out);
 
   hic->pushBackTopLevelExprs(linear_op);
 

From d4326626caf57ff9b0f65d1bb4d433fcc982381f Mon Sep 17 00:00:00 2001
From: snordmann
Date: Mon, 20 Jan 2025 03:24:06 -0800
Subject: [PATCH 3/3] lint

---
 csrc/host_ir/executor.cpp   | 5 ++---
 tests/cpp/test_host_irs.cpp | 9 +++++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index f943ccc6927..170f4fcc6dc 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -551,9 +551,8 @@ void HostIrEvaluator::handle(LinearOp* linear) {
   TensorView* bias = linear->bias()->as<TensorView>();
   TensorView* out = linear->out()->as<TensorView>();
   NVF_ERROR(
-      expr_evaluator_.isKnown(in)
-      && expr_evaluator_.isKnown(weight)
-      && (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
+      expr_evaluator_.isKnown(in) && expr_evaluator_.isKnown(weight) &&
+      (!linear->has_bias() || expr_evaluator_.isKnown(bias)),
       "Inputs of the Linear Op ",
       linear->toString(),
       " must be precomputed before being retrieved");
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
index 687072e172c..e0f41c70a91 100644
--- a/tests/cpp/test_host_irs.cpp
+++ b/tests/cpp/test_host_irs.cpp
@@ -879,7 +879,9 @@ TEST_F(LinearHostIrTest, HostIr) {
   at::Tensor weight_at = at::randn({N, K}, options);
   at::Tensor bias_at = at::randn({N}, options);
   std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
-      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}};
+      {hie.inputs().at(0), in_at},
+      {hie.inputs().at(1), weight_at},
+      {hie.inputs().at(2), bias_at}};
 
   auto output = hie.runWithInput(concrete_input_buffers).at(0);
 
@@ -920,7 +922,10 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   at::Tensor bias_at = at::randn({N}, options);
   at::Tensor out_at = at::empty({B, M, N}, options);
   std::unordered_map<Val*, c10::IValue> concrete_input_buffers = {
-      {hie.inputs().at(0), in_at}, {hie.inputs().at(1), weight_at}, {hie.inputs().at(2), bias_at}, {hie.inputs().at(3), out_at}};
+      {hie.inputs().at(0), in_at},
+      {hie.inputs().at(1), weight_at},
+      {hie.inputs().at(2), bias_at},
+      {hie.inputs().at(3), out_at}};
 
   hie.runWithInput(concrete_input_buffers);
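
---

Note (not part of the patches): a minimal standalone sketch of the ATen call
this series builds on. at::linear_out computes out = in @ weight^T + bias into
a caller-provided buffer instead of allocating a fresh tensor, which is what
lets HostIrEvaluator::handle(LinearOp*) honor a preallocated output. The CPU
tensors and the main() wrapper below are illustrative assumptions for a
standalone build, not code from the PR.

    // Sketch only: mirrors the shapes used in LinearHostIrTest.
    #include <ATen/ATen.h>

    int main() {
      at::Tensor in = at::randn({32, 64, 128});   // [B, M, K]
      at::Tensor weight = at::randn({256, 128});  // [N, K]
      at::Tensor bias = at::randn({256});         // [N]

      // Preallocated output buffer, as in the HostIrLinearOut test.
      at::Tensor out = at::empty({32, 64, 256});  // [B, M, N]

      // Writes the result into `out` in place; no new allocation.
      at::linear_out(out, in, weight, bias);

      // Matches the allocating form at::linear.
      TORCH_CHECK(out.allclose(at::linear(in, weight, bias)));
      return 0;
    }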