diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index a35c6d42c4..ac9eaf6822 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -20,11 +20,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.11-2.2 - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: cpu-3.11-2.3 container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest @@ -35,8 +30,13 @@ jobs: markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - name: cpu-3.11-2.5 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 + markers: not daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: cpu-3.11-2.5-composer + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer @@ -45,11 +45,6 @@ jobs: markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml - - name: daily-cpu-3.11-2.2 - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: daily-cpu-3.11-2.3 container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest @@ -60,13 +55,18 @@ jobs: markers: daily and 
(remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: daily-cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - name: daily-cpu-3.11-2.5 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 + markers: daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: daily-cpu-3.11-2.5-composer + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest - container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -107,12 +107,6 @@ jobs: include: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time # on MCLOUD and not eat up all GPUs at once - - name: "gpu-3.11-2.2-1-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 1 - name: "gpu-3.11-2.3-1-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -125,12 +119,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - - name: "gpu-3.11-2.2-2-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + - name: "gpu-3.11-2.5-1-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and 
(remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" - gpu_num: 2 + gpu_num: 1 - name: "gpu-3.11-2.3-2-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -143,12 +137,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 - - name: "gpu-3.11-2.2-4-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + - name: "gpu-3.11-2.5-2-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" - gpu_num: 4 + gpu_num: 2 - name: "gpu-3.11-2.3-4-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -161,6 +155,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 4 + - name: "gpu-3.11-2.5-4-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 4 steps: - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 38ebe9d2c7..755e85ad00 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -16,10 +16,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.11-2.2 - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: not daily and not remote and not gpu and not doctest - pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 container: 
mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest @@ -28,8 +24,12 @@ jobs: container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest + - name: cpu-3.11-2.5 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and not doctest + pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 447f824e67..d3f1e8e90e 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -1,6 +1,6 @@ name: PR GPU tests on: - pull_request_target: + pull_request: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev @@ -15,8 +15,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.4-1 - container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - name: gpu-3.11-2.5-1 + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -44,8 +44,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.4-2 - container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - name: gpu-3.11-2.5-2 + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -74,8 +74,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.4-4 - container: 
mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - name: gpu-3.11-2.5-4 + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml diff --git a/composer/checkpoint/state_dict.py b/composer/checkpoint/state_dict.py index 0c9e1606d2..9a843b14cc 100644 --- a/composer/checkpoint/state_dict.py +++ b/composer/checkpoint/state_dict.py @@ -88,7 +88,7 @@ def get_model_state_dict( log.debug('Calling model.state_dict() for non-FSDP model...') model_state_dict = model.state_dict() if isinstance(model, DistributedDataParallel): - nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.') + nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.') # type: ignore if include_keys is not None: model_state_dict = _extract_keys_from_state_dict(model_state_dict, include_keys) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index acf7f7d10f..72550c12e2 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -916,7 +916,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( peft_config.save_pretrained(str(output_folder)) weights_state_dict = composer_state_dict['state']['model'] - torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') + torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') # type: ignore # NOTE: This only works for default adapter name, not multiple adapters if peft_config is not None: diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 77c4d733f7..fd7a6c9df8 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -106,7 +106,13 @@ def patch_pytorch(): elif version.parse(torch.__version__) < version.parse('2.4.1'): # Monkey patch for 
torch < 2.4.1 ie torch == 2.4.0 - # No monkeypatches! + # No monkeypatches besides unshard (below)! + pass + + elif version.parse(torch.__version__) < version.parse('2.5.1'): + # Monkey patch for torch < 2.5.1 ie torch == 2.5.0 + + # No monkeypatches besides unshard (below)! pass @@ -1046,3 +1052,52 @@ def unshard_with_sync(self): raise RuntimeError('CUDA out of memory encountered on a different rank') padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) self._use_unsharded_flat_param(padded_unsharded_flat_param) + +if version.parse(torch.__version__) >= version.parse('2.5.0') and version.parse( + torch.__version__, +) < version.parse('2.5.1'): + + # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks + from torch.distributed.fsdp._flat_param import FlatParamHandle + original_unshard = FlatParamHandle.unshard + + @no_type_check + def unshard_with_sync(self): + """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`. + + This prevents deadlocks when some ranks OOM after the alloc call and others do not. + This is a patched method from pytorch, meant to be called when automicrobatching + turns on hooks in its search process for the optimal non-OOMing microbatch size. + This includes all-gathering the flat parameter + and switching to using the unsharded flat parameter. If the handle does + not need unsharding, then this only switches to using the unsharded + flat parameter. For ``NO_SHARD``, this is a no-op. + If FSDP is in :meth:`summon_full_params` and the handle uses parameter + mixed precision, then the parameter is forced to full precision. 
+ """ + if not self.needs_unshard(): + # Even when not needing an unshard, we should switch to using + # the unsharded flat parameter + unsharded_flat_param = ( + self._get_padded_unsharded_flat_param() + if self.uses_sharded_strategy + else self.flat_param + ) + self._use_unsharded_flat_param(unsharded_flat_param) + return + unsharded_flat_param = self._alloc_padded_unsharded_flat_param() + + # Check if any other rank hit an OOM + found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX') + found_cuda_oom = found_cuda_oom_tensor.item() + # Signal current rank is still in batch + all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN') + + if found_cuda_oom == 1: + raise RuntimeError('CUDA out of memory encountered on a different rank') + padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) + self._use_unsharded_flat_param(padded_unsharded_flat_param) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index e17f5cf7a6..db7752f879 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2317,9 +2317,11 @@ def fit( self.state.max_duration = duration + self.state.timestamp.get(duration.unit) - # Raise error if callig fit with SGD - if type( - self.state.optimizers[0], - ) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0'): + # Raise error if calling fit with SGD on PyTorch 2.4.x + if ( + type(self.state.optimizers[0]) == torch.optim.SGD and + version.parse(torch.__version__) >= version.parse('2.4.0') and + version.parse(torch.__version__) < version.parse('2.5.0') + ): raise ValueError( 'PyTorch 2.4 breaks (distributed) checkpointing with SGD. ' 'Please use a different optimizer, e.g. 
composer.optim.DecoupledSGDW, ' diff --git a/docker/README.md b/docker/README.md index fd68d04951..ca047829ad 100644 --- a/docker/README.md +++ b/docker/README.md @@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.5.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | 
`mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 65b8e747a1..b3676f2012 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,129 +2,103 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-1-cu124 + IMAGE_NAME: torch-2-5-0-cu124 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.1 + PYTORCH_VERSION: 2.5.0 TAGS: - - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 - mosaicml/pytorch:latest - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.1 + TORCHVISION_VERSION: 0.20.0 - AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-1-cu124-aws + IMAGE_NAME: torch-2-5-0-cu124-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.1 + PYTORCH_VERSION: 2.5.0 TAGS: - - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws - - 
ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws - ghcr.io/databricks-mosaic/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.1 + TORCHVISION_VERSION: 0.20.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-4-1-cpu + IMAGE_NAME: torch-2-5-0-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.1 + PYTORCH_VERSION: 2.5.0 TAGS: - - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu - ghcr.io/databricks-mosaic/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.1 + TORCHVISION_VERSION: 0.20.0 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-3-1-cu121 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + CUDA_VERSION: 12.4.1 + IMAGE_NAME: torch-2-4-1-cu124 MOFED_VERSION: latest-23.10 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 
brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-3-1-cu121-aws + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + CUDA_VERSION: 12.4.1 + IMAGE_NAME: torch-2-4-1-cu124-aws MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 
brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - 
IMAGE_NAME: torch-2-3-1-cpu + IMAGE_NAME: torch-2-4-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-2-2-cu121 + IMAGE_NAME: torch-2-3-1-cu121 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -143,16 +117,16 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-2-2-cu121-aws + IMAGE_NAME: torch-2-3-1-cu121-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -171,27 +145,27 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' 
PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-2-2-cpu + IMAGE_NAME: torch-2-3-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.25.0 @@ -202,14 +176,14 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.1 + PYTORCH_VERSION: 2.5.0 TAGS: - mosaicml/composer:0.25.0 - ghcr.io/databricks-mosaic/composer:0.25.0 - mosaicml/composer:latest - ghcr.io/databricks-mosaic/composer:latest TARGET: composer_stage - TORCHVISION_VERSION: 0.19.1 + TORCHVISION_VERSION: 0.20.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.25.0 @@ -220,11 +194,11 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.1 + PYTORCH_VERSION: 2.5.0 TAGS: - mosaicml/composer:0.25.0_cpu - ghcr.io/databricks-mosaic/composer:0.25.0_cpu - mosaicml/composer:latest_cpu - 
ghcr.io/databricks-mosaic/composer:latest_cpu TARGET: composer_stage - TORCHVISION_VERSION: 0.19.1 + TORCHVISION_VERSION: 0.20.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index a3336a3d19..4a931ec2e7 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -20,16 +20,16 @@ import yaml PRODUCTION_PYTHON_VERSION = '3.11' -PRODUCTION_PYTORCH_VERSION = '2.4.1' +PRODUCTION_PYTORCH_VERSION = '2.5.0' def _get_torchvision_version(pytorch_version: str): + if pytorch_version == '2.5.0': + return '0.20.0' if pytorch_version == '2.4.1': return '0.19.1' if pytorch_version == '2.3.1': return '0.18.1' - if pytorch_version == '2.2.2': - return '0.17.2' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') @@ -45,12 +45,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' + if pytorch_version == '2.5.0': + return '12.4.1' if pytorch_version == '2.4.1': return '12.4.1' if pytorch_version == '2.3.1': return '12.1.1' - if pytorch_version == '2.2.2': - return '12.1.1' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') @@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] + python_pytorch_versions = [('3.11', '2.5.0'), ('3.11', '2.4.1'), ('3.11', '2.3.1')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/setup.py b/setup.py index d1511cb171..c08a958a08 100644 --- a/setup.py +++ b/setup.py @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str): 'tqdm>=4.62.3,<5', 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', - 'torchvision>=0.14.0,<0.19.2', - 'torch>=2.2.0,<2.4.2', + 'torchvision>=0.18.0,<0.20.1', + 'torch>=2.3.0,<2.5.1', 
'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.2.0', 'psutil>=5.8.0,<7', diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py index b77741ae46..52838c9aa5 100644 --- a/tests/checkpoint/helpers.py +++ b/tests/checkpoint/helpers.py @@ -8,7 +8,7 @@ from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.api import CPUOffload -from torch.optim import adam +from torch.optim import Adam from torch.optim.lr_scheduler import StepLR from torch.utils.data import DataLoader @@ -183,7 +183,7 @@ def init_optimizer( inputs = torch.randn(batch_size, num_features, device=device) targets = torch.randint(low=0, high=num_classes, size=(batch_size,), device=device, dtype=torch.long) batch = (inputs, targets) if use_composer_model else inputs - optimizer = adam.Adam(model.parameters()) + optimizer = Adam(model.parameters()) outputs = model(batch) loss = loss_fn(outputs, targets) loss.backward() diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index d2679c2868..3e93ce56b3 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -417,8 +417,11 @@ def test_checkpoint_saver_properly_constructed( # See https://github.com/pytorch/pytorch/issues/133415 @pytest.mark.xfail @pytest.mark.skipif( - version.parse(torch.__version__) < version.parse('2.4.0'), - reason='Test only applies to PyTorch 2.4+', + ( + version.parse(torch.__version__) < version.parse('2.4.0') or + version.parse(torch.__version__) >= version.parse('2.5.0') + ), + reason='Test only applies to PyTorch 2.4.x', ) def test_sgd_checkpoint( self,