From ca70ad7f70251294adc38e0b71df6b83c5fe8c05 Mon Sep 17 00:00:00 2001 From: "Wang, Xiao" <24860335+xwang233@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:37:45 -0700 Subject: [PATCH 1/2] Update !build and !test triggers for CI pipelines; add a CI hello message for pull requests (#3315) per title; wiki and backend trigger will be updated later --- .github/workflows/nvfuser-ci-trigger.yml | 2 +- .github/workflows/pull.yml | 25 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pull.yml diff --git a/.github/workflows/nvfuser-ci-trigger.yml b/.github/workflows/nvfuser-ci-trigger.yml index c74472a881e..d4a79b95a9c 100644 --- a/.github/workflows/nvfuser-ci-trigger.yml +++ b/.github/workflows/nvfuser-ci-trigger.yml @@ -16,7 +16,7 @@ jobs: # This job only runs for pull request comments if: | - startsWith(github.event.comment.body, '!build') && + ( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) && (github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka') steps: - name: Check if comment is issued by authorized person diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml new file mode 100644 index 00000000000..d27de8781ed --- /dev/null +++ b/.github/workflows/pull.yml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# A workflow to send CI-related helpful information to PRs +name: pull +on: + pull_request: + +run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }} +jobs: + status_hello: + name: send CI hello status + runs-on: ubuntu-latest + permissions: + statuses: write + steps: + - name: Set CI hello status + run: | + curl \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \ + -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}" From 3d9677de79b092101358863119bdef588ed3bccb Mon Sep 17 00:00:00 2001 From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com> Date: Wed, 30 Oct 2024 20:07:16 -0400 Subject: [PATCH 2/2] Add script to check for non-determinism (#3312) For example, try running the following commands: ```bash rm -rf /tmp/nvfuser_kernel_db tools/check_determinism.sh -- pytest -vs tests/python/test_ops.py::test_correctness_var_mean_float64 # This fails with a message like # 10845c10845 # < __global__ void nvfuser_inner_persistent_f7_c1_r0_g2(Tensor T0, Tensor T8, Tensor T7) { # --- # > __global__ void nvfuser_inner_persistent_f7_c1_r0_g2(Tensor T0, Tensor T7, Tensor T8) { # 10897c10897 # < T8[0] # --- # > T7[0] # 10923c10923 # < T7[0] # --- # > T8[0] # Diff of __tmp_kernel_inner_persistent_f7_c1_r0_g2.cu from rep 1 to rep 5 (above) is non-zero rm -rf /tmp/nvfuser_kernel_db export DEBUG_SERDE=disable tools/check_determinism.sh -- pytest -vs tests/python/test_ops.py::test_correctness_var_mean_float64 # This succeeds ``` Note that this script will 
#!/bin/bash
#
# check_determinism.sh - run a command several times and verify that the
# *.cu files it leaves in the current directory are identical on every run.
#
# Usage: check_determinism.sh [-h] [-n NUMREPS=10] -- command to run
#
# What the script does:
#   * Exports NVFUSER_DUMP=cuda_to_file before running the command.
#   * Before each repetition, moves any *.cu files already present in the
#     current directory to ./check_determinism-kernelbackup<timestamp>.
#   * Runs the command (its exit status is ignored), then moves the *.cu files
#     now present in the current directory to a per-repetition temp directory.
#   * Compares the number of files and, with diff, the content of each file
#     against repetition 1.
#   * Stops at the first repetition that shows a difference.
#
# Exit status: 0 if every repetition matched repetition 1; 1 on any mismatch,
# on a usage error, and (as in the original) for -h.
#
# ---- carried over from the patch text that shares these lines -------------
# The text below is not part of the script. It is the tail of the commit
# message and the diff header that preceded the script in the collapsed patch,
# kept verbatim so that no original text is lost:
#   delete any existing *.cu files in the current directory.
#   ---
#   tools/check_determinism.sh | 105 +++++++++++++++++++++++++++++++++++++
#   1 file changed, 105 insertions(+)
#   create mode 100755 tools/check_determinism.sh
#   diff --git a/tools/check_determinism.sh b/tools/check_determinism.sh
#   new file mode 100755
#   index 00000000000..9b1d355afd5
#   --- /dev/null
#   +++ b/tools/check_determinism.sh
#   @@ -0,0 +1,105 @@
# NOTE(review): the commit message says existing *.cu files are deleted; the
# code below moves them into a backup directory instead.
# -----------------------------------------------------------------------------

set -e
set -o pipefail

# Print the one-line synopsis to stdout.
usage() {
  echo "Usage: $0 [-h] [-n NUMREPS=10] -- command to run"
}

# Default advertised by usage. A NUMREPS inherited from the environment is
# still honored, as it was when the variable was only set by -n.
NUMREPS=${NUMREPS:-10}

while getopts "n:h" arg; do
  case "$arg" in
    n)
      # Do not shift here: getopts tracks its position through OPTIND, and
      # shifting inside the loop makes it skip the options that follow.
      NUMREPS=$OPTARG
      ;;
    *)
      # -h, or an option getopts rejected (arg is then "?").
      usage
      exit 1
      ;;
  esac
done

# getopts stops at "--" and consumes it. Detect that case; otherwise skip
# forward to the first "--" so that everything after it is the command.
last_parsed=$((OPTIND - 1))
separator_seen=0
if ((last_parsed >= 1)) && [[ "${!last_parsed}" == "--" ]]; then
  separator_seen=1
fi
shift "$last_parsed"
while ((separator_seen == 0 && $# > 0)); do
  if [[ "$1" == "--" ]]; then
    separator_seen=1
  fi
  shift
done

if [[ ! "$NUMREPS" =~ ^[0-9]+$ ]]; then
  echo "NUMREPS must be a non-negative integer, got '$NUMREPS'" >&2
  usage >&2
  exit 1
fi
# Force base 10 so a value such as "08" is not parsed as invalid octal.
NUMREPS=$((10#$NUMREPS))

if (($# == 0)); then
  echo "No command given after --" >&2
  usage >&2
  exit 1
fi
# Keep the command as an array so arguments containing spaces or glob
# characters reach the program exactly as the caller wrote them.
CMD=("$@")

export NVFUSER_DUMP=cuda_to_file

KERNELDIR=$(mktemp -d)

# Remove the temp directory holding the per-repetition kernel copies.
cleanup() {
  rm -rf -- "${KERNELDIR:?}"
}

trap cleanup EXIT

# Make "*.cu" expand to nothing (not to the literal pattern) when no file
# matches, so the arrays below are empty in that case.
shopt -s nullglob

FIRSTREPDIR="$KERNELDIR/1"

retval=0
for ((rep = 1; rep <= NUMREPS; rep++)); do
  # Set aside kernels left over from earlier runs so that they are not
  # mistaken for output of this repetition.
  existing=(./*.cu)
  if ((${#existing[@]} > 0)); then
    KERNELBACKUPDIR=./check_determinism-kernelbackup$(date +%Y%m%d.%H%M%S)
    echo "Backing up ${#existing[@]} existing .cu files to $KERNELBACKUPDIR"
    mkdir -p "$KERNELBACKUPDIR"
    mv -- "${existing[@]}" "$KERNELBACKUPDIR"
  fi

  # The command does not need to succeed for us to analyze its kernels.
  cmdstatus=0
  "${CMD[@]}" || cmdstatus=$?

  REPDIR="$KERNELDIR/$rep"
  mkdir -p "$REPDIR"
  generated=(./*.cu)
  if ((${#generated[@]} == 0)); then
    # Without this check mv would abort the script with only a cryptic
    # "cannot stat" error.
    echo "No .cu files found in the current directory after repetition $rep (command exit status $cmdstatus)" >&2
    retval=1
    break
  fi
  mv -- "${generated[@]}" "$REPDIR/"

  first_kernels=("$FIRSTREPDIR"/*.cu)
  rep_kernels=("$REPDIR"/*.cu)
  NUMFIRST=${#first_kernels[@]}
  NUMREP=${#rep_kernels[@]}
  if ((NUMREP != NUMFIRST)); then
    echo "Created $NUMFIRST kernels on first repetition and $NUMREP on repetition $rep"
    retval=1
  fi

  for newkernel in "${rep_kernels[@]}"; do
    kernelname=${newkernel##*/}
    firstkernel="$FIRSTREPDIR/$kernelname"
    if [[ ! -f "$firstkernel" ]]; then
      echo "Kernel file $newkernel in repetition $rep does not exist in first repetition"
      retval=1
      continue
    fi
    # diff prints the differences itself; any non-zero status is a mismatch.
    if ! diff -- "$firstkernel" "$newkernel"; then
      printf 'Diff of %s from rep 1 to rep %d (above) is non-zero\n' "$kernelname" "$rep"
      retval=1
    fi
  done

  if ((retval != 0)); then
    # Stop repetitions after first failure
    break
  fi
  echo "Generated kernels all match"
done

exit "$retval"