# testing torchao autoquant [WIP] #3701
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that runs the TorchBench GPU stability test for a single model.
name: TorchBench GPU model stability test
on:
  # Manual trigger: the model to test is supplied through the `model` input.
  workflow_dispatch:
    inputs:
      model:
        description: "Model Name"
        required: true
        default: "fastNLP_Bert"
  # Pull-request trigger with no filters; the job-level `if:` decides whether
  # the test actually runs for a given PR.
  pull_request:
jobs:
  # Runs one model REPEAT times in eval mode and REPEAT times in train mode,
  # then hands both logs to .github/scripts/test-repeated-runs.py for checking.
  stability_test:
    env:
      CONDA_ENV: "stability-test-ci"
      TEST_HOME: "/tmp/tb-stability-ci"
      PYTHON_VERSION: "3.8"
      CUDA_VERSION: "cu116"
      # Untrusted PR text and the dispatch input are passed through the
      # environment so they are never interpolated into the shell script text.
      PR_BODY: ${{ github.event.pull_request.body }}
      MODEL: ${{ github.event.inputs.model }}
      GPU_ID: "1"
      # Passed verbatim to `nvidia-smi -ac`.
      GPU_FREQ: "5001,900"
      # Number of runs per mode (eval and train).
      REPEAT: "10"
    # Run only for a manual dispatch with a model, or for a PR whose body
    # contains the STABLE_TEST_MODEL: marker.
    if: ${{ (github.event.inputs.model || contains(github.event.pull_request.body, 'STABLE_TEST_MODEL:')) }}
    runs-on: [self-hosted, bm-runner]
    timeout-minutes: 120  # 2 hours
    environment: docker-s3-upload
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Create conda environment with pytorch nightly
        run: |
          conda create -y -n "${CONDA_ENV}" python="${PYTHON_VERSION}"
          . activate "${CONDA_ENV}"
          conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \
            cmake cffi typing_extensions future six dataclasses tabulate gitpython
          # Install pytorch nightly
          pip install --pre torch torchvision torchaudio \
            -f "https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html"
          # Install torchbench dependencies
          python install.py
      - name: Stability test
        run: |
          # The default step shell is `bash -e` without pipefail, so without
          # this a failing `python run.py` would be hidden by `| tee`.
          set -o pipefail
          . activate "${CONDA_ENV}"
          mkdir -p "${TEST_HOME}"
          if [ -z "${MODEL}" ]; then
            # No dispatch input: write the PR body to a file and let the
            # helper script print the model name on stdout.
            PR_BODY_FILE="${TEST_HOME}/pr-body.txt"
            # printf instead of echo: a body starting with -n/-e must not be
            # interpreted as an echo option.
            printf '%s\n' "${PR_BODY}" > "${PR_BODY_FILE}"
            MODEL="$(python ./.github/scripts/test-repeated-runs.py --pr-body "${PR_BODY_FILE}")"
          fi
          if [ -z "${MODEL}" ]; then
            echo "No model name given via workflow input or PR body" >&2
            exit 1
          fi
          # Setup nvidia gpu frequency
          sudo nvidia-persistenced --user "${USER}" || true
          # NOTE(review): `-pm` takes a persistence-mode flag (0/1), not a GPU
          # index; this enables persistence mode only because GPU_ID is "1".
          # Confirm whether `-i "${GPU_ID}" -pm 1` was intended.
          sudo nvidia-smi -pm "${GPU_ID}"
          sudo nvidia-smi -ac "${GPU_FREQ}"
          # Run the tests
          EVAL_LOG="${TEST_HOME}/eval-${MODEL}.log"
          # Start from an empty log; each run below appends to it.
          echo -n > "${EVAL_LOG}"
          for i in $(seq 1 "${REPEAT}"); do
            python run.py "${MODEL}" -t eval -d cuda | tee -a "${EVAL_LOG}"
          done
          TRAIN_LOG="${TEST_HOME}/train-${MODEL}.log"
          echo -n > "${TRAIN_LOG}"
          for i in $(seq 1 "${REPEAT}"); do
            python run.py "${MODEL}" -t train -d cuda | tee -a "${TRAIN_LOG}"
          done
          # Check the stability of GPU tests.
          # Each check is its own command (not `check && echo`): `set -e`
          # ignores a failure on the left-hand side of `&&`, which would let
          # a failed inference check pass silently.
          python ./.github/scripts/test-repeated-runs.py --log "${EVAL_LOG}"
          echo "GPU stability test pass for inference!"
          python ./.github/scripts/test-repeated-runs.py --log "${TRAIN_LOG}"
          echo "GPU stability test pass for train!"
      - name: Remove conda environment
        # Clean up even when an earlier step failed, so the self-hosted
        # runner does not keep a stale environment around.
        if: always()
        run: |
          conda env remove -y --name "${CONDA_ENV}"