diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml
new file mode 100644
index 00000000..c2196cb3
--- /dev/null
+++ b/.github/workflows/gpu.yml
@@ -0,0 +1,39 @@
+name: Tests on GPU
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    # This will trigger the workflow for pull requests to any branch
+    types: [opened, synchronize, reopened]
+
+jobs:
+  gpu-tests:
+    name: python
+    runs-on: self-hosted
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: "uv.lock"
+
+
+
+      - name: Set up Python
+        run: uv python install 3.10.13
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Install flash attention
+        run: uv pip install flash-attn --no-build-isolation
+
+      - name: Run tests
+        run: uv run pytest tests
\ No newline at end of file
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index fc47503b..14d02b99 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -21,6 +21,7 @@
 from zeroband.loss import cross_entropy_max_z_loss
 
 from zeroband.utils import (
+    FakeTokenizer,
     GPUMemoryMonitor,
     PerfCounter,
     get_module_signature,
@@ -137,7 +138,9 @@ def train(config: Config):
         config.ckpt.interval % config.diloco.inner_steps == 0
     ), "ckpt interval must be a multiple of diloco inner steps as we only save at the end of an outer step"
 
-    if config.type_model == "llama2":
+    if config.data.fake and config.name_model == "debugmodel":
+        tokenizer = FakeTokenizer()
+    elif config.type_model == "llama2":
         tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True)
     elif config.type_model == "llama3":
         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True)
diff --git a/src/zeroband/utils/__init__.py b/src/zeroband/utils/__init__.py
index 2638d156..1bb454fb 100644
--- a/src/zeroband/utils/__init__.py
+++ b/src/zeroband/utils/__init__.py
@@ -229,3 +229,11 @@ def get_random_available_port_list(num_port):
 
 def get_random_available_port():
     return get_random_available_port_list(1)[0]
+
+
+class FakeTokenizer(object):
+    def __init__(self):
+        self.vocab_size = 1000
+        self.bos_token_id = 0
+        self.eos_token_id = 1
+        self.pad_token_id = 2
diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py
index 69b66644..e5703fe3 100644
--- a/tests/test_torchrun/test_train.py
+++ b/tests/test_torchrun/test_train.py
@@ -6,6 +6,10 @@
 
 from zeroband.diloco import Compression
 
+import torch
+
+num_gpu = torch.cuda.device_count()
+
 
 def get_random_available_port_list(num_port):
     # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number
@@ -77,7 +81,7 @@ def test_multi_gpu(num_gpus):
     _test_multi_gpu(num_gpus, "debug/normal.toml")
 
 
-@pytest.mark.parametrize("num_gpus", [[2, 1], [2, 2]])
+@pytest.mark.parametrize("num_gpus", [[2, 1], [2, 2]] if num_gpu >= 4 else [[2, 1]])
 def test_multi_gpu_diloco(num_gpus):
     _test_multi_gpu(num_gpus, "debug/diloco.toml", diloco=True)
 
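Note on the fake-tokenizer path: when `config.data.fake` is set, no real text gets tokenized, so the stub only has to expose the attributes the trainer reads (`vocab_size` plus the special token ids). A minimal sketch of how such a stub can stand in for a real tokenizer — the batch shape and construction below are illustrative assumptions, not what zeroband's fake-data loader actually does:

```python
import torch

from zeroband.utils import FakeTokenizer

tok = FakeTokenizer()

# Hypothetical fake batch: random token ids drawn from the stub's vocabulary.
# The real fake-data path in zeroband may build batches differently.
batch = torch.randint(0, tok.vocab_size, (2, 16))
batch[:, 0] = tok.bos_token_id   # mark sequence starts
batch[:, -1] = tok.eos_token_id  # mark sequence ends
print(batch.shape, batch[0, :4])
```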
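The parametrize change gates the `[2, 2]` case on the runner's GPU count, since that shape appears to mean two DiLoCo workers with two GPUs each, i.e. four visible devices in total. The same guard pattern in isolation (the config shapes come from the test above; the launch loop and the `[workers, gpus_per_worker]` reading are only illustrative):

```python
import torch

# Same pattern as the parametrize guard: only schedule configurations
# that the current machine can actually satisfy.
num_gpu = torch.cuda.device_count()

# Each entry is assumed to be [workers, gpus_per_worker]; [2, 2] needs
# 2 * 2 = 4 GPUs, so it is skipped on smaller machines.
configs = [[2, 1], [2, 2]] if num_gpu >= 4 else [[2, 1]]

for workers, gpus in configs:
    print(f"would launch {workers} worker(s) x {gpus} GPU(s) = {workers * gpus} total")
```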