Merge remote-tracking branch 'origin/main' into hopper_warptile_split

NVIDIA · Jan 17, 2025 · 41e2b94 · 41e2b94
2 parents 9de3202 + bf66a0c
commit 41e2b94
Show file tree

Hide file tree

Showing 91 changed files with 4,431 additions and 489 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -16,7 +16,7 @@ env:
 
 jobs:
   clang-build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:
@@ -37,7 +37,7 @@ jobs:
           python setup.py build
 
   dynamic-type-meson:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -15,7 +15,7 @@ env:
 
 jobs:
   check-license:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:
@@ -28,7 +28,7 @@ jobs:
           test ! -s missing-header-files.txt
 
   clang-tidy:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:
@@ -72,7 +72,7 @@ jobs:
           git --no-pager diff --diff-filter=d --name-only $head_commit | grep -e "csrc/.*\.cpp" -e "csrc/.*\.h" | xargs lintrunner --take CLANGTIDY --force-color
 
   lintrunner:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
         with:

diff --git a/.github/workflows/nvfuser-ci-trigger.yml b/.github/workflows/nvfuser-ci-trigger.yml
@@ -15,9 +15,34 @@ jobs:
       args: ${{ env.args }}
 
     # This job only runs for pull request comments
-    if: |
-         ( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) &&
-         (github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka')
+    if: >-
+          ( startsWith(github.event.comment.body, '!build') ||
+            startsWith(github.event.comment.body, '!test')
+          ) &&
+          ( github.actor == 'xwang233' || 
+            github.actor == 'jjsjann123' || 
+            github.actor == 'chang-l' || 
+            github.actor == 'csarofeen' || 
+            github.actor == 'drzejan2' || 
+            github.actor == 'IvanYashchuk' || 
+            github.actor == 'jacobhinkle' || 
+            github.actor == 'kevinstephano' || 
+            github.actor == 'liqiangxl' || 
+            github.actor == 'mmigdal-nv' || 
+            github.actor == 'naoyam' || 
+            github.actor == 'ptrblck' || 
+            github.actor == 'rdspring1' || 
+            github.actor == 'samnordmann' || 
+            github.actor == 'zasdfgbnm' || 
+            github.actor == 'crcrpar' || 
+            github.actor == 'nWEIdia' || 
+            github.actor == 'Priya2698' || 
+            github.actor == 'wujingyue' || 
+            github.actor == 'tfogal' || 
+            github.actor == 'protonu' || 
+            github.actor == 'cowanmeg' || 
+            github.actor == 'nsarka'
+          )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -7,6 +7,10 @@ name: pull
 on:
   pull_request:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id}}
+  cancel-in-progress: true
+
 run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }}
 jobs:
   status_hello:
@@ -23,3 +27,47 @@ jobs:
           -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
           https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
           -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
+
+  pr-agent-tools:
+    name: PR Agent tools
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+      packages: read
+    container:
+      image: ghcr.io/nvidia/fuser:ci-llm-workflow
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      env:
+        GITHUB__USER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        CONFIG__PUBLISH_OUTPUT: true
+
+        OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }}
+        OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }}
+        CONFIG__MODEL: ${{ secrets.LLM_CONFIG__MODEL }}
+        CONFIG__CUSTOM_MODEL_MAX_TOKENS: 131072
+
+        CONFIG__MAX_MODEL_TOKENS: 65536
+        CONFIG__PUBLISH_OUTPUT_PROGRESS: false 
+
+        PR_REVIEWER__REQUIRE_SCORE_REVIEW: false
+        PR_REVIEWER__REQUIRE_TESTS_REVIEW: true
+        PR_REVIEWER__REQUIRE_ESTIMATE_EFFORT_TO_REVIEW: true
+        PR_REVIEWER__REQUIRE_CAN_BE_SPLIT_REVIEW: false
+        PR_REVIEWER__REQUIRE_SECURITY_REVIEW: false
+        PR_REVIEWER__REQUIRE_TICKET_ANALYSIS_REVIEW: false
+
+        PR_REVIEWER__ENABLE_REVIEW_LABELS_EFFORT: false
+        PR_REVIEWER__ENABLE_REVIEW_LABELS_SECURITY: false
+
+        PR_REVIEWER__PERSISTENT_COMMENT: true
+        PR_REVIEWER__FINAL_UPDATE_MESSAGE: false
+
+        PR_REVIEWER__EXTRA_INSTRUCTIONS: |
+          Focus on potential logic change, especially on changes to function signatures.
+
+    steps:
+      - name: PR Agent review
+        run: python /app/pr_agent/cli.py --pr_url ${{ github.event.pull_request.html_url }} review
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -240,6 +240,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/tools/resize_utils.cpp
+  ${NVFUSER_SRCS_DIR}/scheduler/tools/static_repeat.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
@@ -446,6 +447,7 @@ if(BUILD_PYTHON)
   # nvfuser python API sources
   set(NVFUSER_PYTHON_SRCS)
   list(APPEND NVFUSER_PYTHON_SRCS
+    ${NVFUSER_SRCS_DIR}/python_frontend/communicator_bindings.cpp
     ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp
     ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp
     ${NVFUSER_SRCS_DIR}/python_frontend/schedule_bindings.cpp
@@ -698,7 +700,12 @@ if(BUILD_TEST)
   add_test(tutorial "${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp" "")
   list(APPEND TEST_BINARIES tutorial)
 
-  add_test(test_host_ir "${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp" "")
+  set(HOSTIR_TEST_SRCS)
+  list(APPEND HOSTIR_TEST_SRCS
+    ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
+    ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
+  )
+  add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
   list(APPEND TEST_BINARIES test_host_ir)
 
   if(BUILD_PYTHON)

diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp
@@ -190,14 +190,14 @@ int64_t runBenchmarkIterations(
               ->groups()
               .size() > 1;
 
-  const auto& compile_log = executor_cache->getMostRecentExecutorInfo();
-  auto params = toString(compile_log.params);
-  auto lparams = toString(
-      compile_log.fusion_executor->as<KernelExecutor>()->lastLaunchParams());
   // Only set if not segmented. In the case of segmented fusions,
   // this could be confusing as the log would reflect only the last
   // segment. Revisit if necessary.
   if (!segmented) {
+    const auto& compile_log = executor_cache->getMostRecentExecutorInfo();
+    auto params = toString(compile_log.params);
+    auto lparams = toString(
+        compile_log.fusion_executor->as<KernelExecutor>()->lastLaunchParams());
     benchmark_state.SetLabel(params + lparams);
   }
 

diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py
@@ -32,6 +32,11 @@ def pytest_addoption(parser):
         action="store_true",
         help="Benchmarks torch.compile mode.",
     )
+    parser.addoption(
+        "--benchmark-thunder-torchcompile",
+        action="store_true",
+        help="Benchmarks torch.compile mode.",
+    )
 
     # pytest-benchmark does not have CLI options to set rounds/warmup_rounds for benchmark.pedantic.
     # The following two options are used to overwrite the default values through CLI.
@@ -104,14 +109,14 @@ def pytest_collection_modifyitems(session, config, items):
 
     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test
 
-    executors = ["eager", "torchcompile", "thunder"]
+    executors = ["eager", "torchcompile", "thunder", "thunder-torchcompile"]
 
     def get_test_executor(item) -> str | None:
         if hasattr(item, "callspec") and "executor" in item.callspec.params:
             test_executor = item.callspec.params["executor"]
             assert (
                 test_executor in executors
-            ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
+            ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', 'thunder-torchcompile', found {test_executor}."
             return test_executor
         return None
 

diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
+from collections.abc import Iterable
 import pytest_benchmark
 import torch
 from torch.autograd import DeviceType
@@ -47,14 +48,18 @@ def unary_bwd_torch(inputs: List):  # [output, grad_out]
     inputs[0].backward(inputs[1], retain_graph=True)
 
 
-def with_executor(executor: str, fwd_fn: Callable) -> Callable:
-    assert executor in ["eager", "torchcompile", "thunder"]
+def with_executor(executor: str, fwd_fn: Callable, **kwargs) -> Callable:
+    assert executor in ["eager", "torchcompile", "thunder", "thunder-torchcompile"]
     if executor == "eager":
         return fwd_fn
     if executor == "torchcompile":
-        return torch.compile(fwd_fn)
+        return torch.compile(fwd_fn, **kwargs)
     if executor == "thunder":
-        return thunder.jit(fwd_fn, nv_enable_bookend=False, executors=[nvfuserex])
+        return thunder.jit(
+            fwd_fn, nv_enable_bookend=False, executors=[nvfuserex], **kwargs
+        )
+    if executor == "thunder-torchcompile":
+        return thunder.jit(fwd_fn, executors=["torchcompile"], **kwargs)
 
 
 def compute_total_iobytes(
@@ -221,9 +226,9 @@ def set_metrics(
             % Peak Bandwidth (SOL): 100 * Bandwidth /PEAK_BANDWIDTH
         """
         if not iobytes:
-            if isinstance(inputs, torch.Tensor):
+            if not isinstance(inputs, Iterable):
                 inputs = [inputs]
-            if isinstance(outputs, torch.Tensor):
+            if not isinstance(outputs, Iterable):
                 outputs = [outputs]
 
             iobytes = 0

diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py
@@ -492,7 +492,6 @@ def norm_bwd_baseline_benchmark(
 
     norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn
 
-    # Compile the fwd fn for torchcompile
     fwd_fn = with_executor(executor, norm_fwd_fn)
     fwd_inputs = [inputs, weight, bias, running_mean, running_var]
     outputs = fwd_fn(fwd_inputs)