From ba392de663c5bef2df8eb2aa719d5f3841cf1117 Mon Sep 17 00:00:00 2001
From: Sami Jaghouar
Date: Thu, 26 Sep 2024 23:41:20 +0000
Subject: [PATCH] fix config to use shard grad op

---
 configs/150M/3090.toml | 2 +-
 configs/150M/A40.toml  | 2 +-
 configs/150M/H100.toml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/150M/3090.toml b/configs/150M/3090.toml
index 866d6054..e792dd00 100644
--- a/configs/150M/3090.toml
+++ b/configs/150M/3090.toml
@@ -3,7 +3,7 @@ project = "debug_150m_zero_band"
 
 [train]
 micro_bs = 16 # change this base on the gpu
-sharding_strategy = "NO_SHARD"
+sharding_strategy = "SHARD_GRAD_OP"
 
 [optim]
 batch_size = 512
diff --git a/configs/150M/A40.toml b/configs/150M/A40.toml
index e7799417..867679c1 100644
--- a/configs/150M/A40.toml
+++ b/configs/150M/A40.toml
@@ -3,7 +3,7 @@ project = "debug_150m_zero_band"
 
 [train]
 micro_bs = 32 # change this base on the gpu
-sharding_strategy = "NO_SHARD"
+sharding_strategy = "SHARD_GRAD_OP"
 
 [optim]
 batch_size = 512
diff --git a/configs/150M/H100.toml b/configs/150M/H100.toml
index 49a65475..3b5d7dfa 100644
--- a/configs/150M/H100.toml
+++ b/configs/150M/H100.toml
@@ -3,7 +3,7 @@ project = "debug_150m_zero_band"
 
 [train]
 micro_bs = 64 # change this base on the gpu
-sharding_strategy = "NO_SHARD"
+sharding_strategy = "SHARD_GRAD_OP"
 
 [optim]
 batch_size = 512