diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml
index c50215da49a..3eebc5d0582 100644
--- a/.github/workflows/precommit.yml
+++ b/.github/workflows/precommit.yml
@@ -95,7 +95,7 @@ jobs:
     defaults:
       run:
         shell: bash
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4-cores
     steps:
       - uses: actions/checkout@v3
         with:
@@ -125,6 +125,8 @@ jobs:
       - name: Runner info
         continue-on-error: true
         run: |
+          export PATH=/usr/local/cuda-12.1/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
           nvidia-smi
           cat /proc/cpuinfo
           nvcc --version
@@ -140,4 +142,7 @@ jobs:
         run: |
           python -c "import torch; print(torch.cuda.is_available())"
       - name: Run PyTorch precommit test scope
-        run: make test-torch-cuda
+        run: |
+          export PATH=/usr/local/cuda-12.1/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+          make test-torch-cuda
diff --git a/tests/torch/pruning/test_tensor_processor.py b/tests/torch/pruning/test_tensor_processor.py
index 67ae0a3947b..b4267df1b12 100644
--- a/tests/torch/pruning/test_tensor_processor.py
+++ b/tests/torch/pruning/test_tensor_processor.py
@@ -16,10 +16,10 @@
 from nncf.torch.tensor import PTNNCFTensor


-@pytest.mark.parametrize("device", (torch.device("cpu"), torch.device("cuda")))
-def test_ones(device):
-    if not torch.cuda.is_available() and device == torch.device("cuda"):
+def test_ones(use_cuda):
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("There are no available CUDA devices")
+    device = torch.device("cuda" if use_cuda else "cpu")
     shape = [1, 3, 10, 100]
     tensor = PTNNCFPruningTensorProcessor.ones(shape, device)
     assert torch.is_tensor(tensor.tensor)
diff --git a/tests/torch/ptq/test_fast_bias_correction.py b/tests/torch/ptq/test_fast_bias_correction.py
index 6bbdc342e0b..61d98ac7bdd 100644
--- a/tests/torch/ptq/test_fast_bias_correction.py
+++ b/tests/torch/ptq/test_fast_bias_correction.py
@@ -62,6 +62,7 @@ def check_bias(model: NNCFNetwork, ref_bias: list):
     raise ValueError("Not found node with bias")


+@pytest.mark.cuda
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skipping for CPU-only setups")
 class TestTorchCudaFBCAlgorithm(TestTorchFBCAlgorithm):
     @staticmethod
diff --git a/tests/torch/ptq/test_reducers_and_aggregators.py b/tests/torch/ptq/test_reducers_and_aggregators.py
index 1af7b4e4683..84cb20fb9ea 100644
--- a/tests/torch/ptq/test_reducers_and_aggregators.py
+++ b/tests/torch/ptq/test_reducers_and_aggregators.py
@@ -87,6 +87,7 @@ def all_close(self, val: torch.Tensor, ref) -> bool:
         return super().all_close(val, ref)


+@pytest.mark.cuda
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Cuda is not available in current environment")
 class TestCudaReducersAggregators(BaseTestReducersAggregators):
     def get_nncf_tensor(self, x: np.array, dtype: Optional[Dtype] = None):
@@ -97,11 +98,11 @@ def all_close(self, val: torch.Tensor, ref) -> bool:
         return super().all_close(val, ref)


-@pytest.mark.parametrize("device", ["cuda", "cpu"])
 @pytest.mark.parametrize("size,ref", [(16_000_000, 1_600_000.8750), (17_000_000, 1_700_000.7500)])
-def test_quantile_percentile_function(device, size, ref):
-    if not torch.cuda.is_available() and device == "cuda":
+def test_quantile_percentile_function(use_cuda, size, ref):
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("Cuda is not available in current environment")
+    device = "cuda" if use_cuda else "cpu"
     tensor = PTNNCFTensor(torch.arange(1, size, 1).float().to(device))
     res_quantile = PTNNCFCollectorTensorProcessor.quantile(tensor, [0.1], axis=0)
     res_percentile = PTNNCFCollectorTensorProcessor.percentile(tensor, [10], axis=0)
@@ -111,11 +112,11 @@ def test_quantile_percentile_function(device, size, ref):
     assert tensor.is_cuda == (device == "cuda")


-@pytest.mark.parametrize("device", ["cuda", "cpu"])
 @pytest.mark.parametrize("size,ref", [(16_000_000, 8_000_000), (17_000_000, 8_500_000)])
-def test_median_function(device, size, ref):
-    if not torch.cuda.is_available() and device == "cuda":
+def test_median_function(use_cuda, size, ref):
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("Cuda is not available in current environment")
+    device = "cuda" if use_cuda else "cpu"
     tensor = PTNNCFTensor(torch.arange(1, size, 1).float().to(device))
     res = PTNNCFCollectorTensorProcessor.median(tensor, axis=0)
     assert res.tensor == ref
diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py
index ba64f1341b8..8cb5e00932f 100644
--- a/tests/torch/ptq/test_weights_compression.py
+++ b/tests/torch/ptq/test_weights_compression.py
@@ -243,12 +243,11 @@ def test_get_dtype_attribute_of_parameter():
     assert compressed_model.weight.dtype == torch.uint8


-@pytest.mark.parametrize("device", ("cpu", "cuda"))
 @pytest.mark.parametrize("dtype", ("float16", "float32"))
-def test_model_devices_and_precisions(device, dtype):
-    if device == "cuda" and not torch.cuda.is_available():
+def test_model_devices_and_precisions(use_cuda, dtype):
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("Skipping for CPU-only setups")
-    device = torch.device(device)
+    device = torch.device("cuda" if use_cuda else "cpu")
     dtype = torch.float16 if dtype == "float16" else torch.float32

     model = MatMulModel().to(device)
diff --git a/tests/torch/quantization/test_algo_quantization.py b/tests/torch/quantization/test_algo_quantization.py
index 4a70ceebee2..7ad9ce2d598 100644
--- a/tests/torch/quantization/test_algo_quantization.py
+++ b/tests/torch/quantization/test_algo_quantization.py
@@ -232,6 +232,7 @@ def activation_quantizers_dumping_worker(current_gpu, config, tmp_path):
             f.writelines("%s\n" % str(aq_id))


+@pytest.mark.cuda
 def test_activation_quantizers_order_is_the_same__for_resnet50(tmp_path, runs_subprocess_in_precommit):
     if not torch.cuda.is_available():
         pytest.skip("Skipping CUDA test cases for CPU only setups")
@@ -803,7 +804,8 @@ def test_internal_autocast_model(self, initializing_config: NNCFConfig):
         compressed_model(inputs)

     @pytest.mark.parametrize(
-        "device", [pytest.param("cuda"), pytest.param("cpu", marks=pytest.mark.skip(reason="CVS-86697"))]
+        "device",
+        [pytest.param("cuda", marks=pytest.mark.cuda), pytest.param("cpu", marks=pytest.mark.skip(reason="CVS-86697"))],
     )
     def test_manual_partial_half_precision_model(self, initializing_config: NNCFConfig, device: str):
         model = TestHalfPrecisionModels.ModelWithManualPartialHalfPrecision()
@@ -821,11 +823,10 @@ def test_manual_partial_half_precision_model(self, initializing_config: NNCFConfig, device: str):
         # Should complete successfully, including init.
         compressed_model(inputs)

-    @pytest.mark.parametrize("device", ["cpu", "cuda"])
-    def test_external_autocast(self, initializing_config: NNCFConfig, device: str):
+    def test_external_autocast(self, initializing_config: NNCFConfig, use_cuda):
         model = TestHalfPrecisionModels.RegularModel()
         inputs = torch.ones([1, 1, 1, 1])
-        if device == "cuda":
+        if use_cuda:
             if not torch.cuda.is_available():
                 pytest.skip("CUDA not available")
             inputs = inputs.cuda()
@@ -941,6 +942,7 @@ def test_can_quantize_user_module_with_addmm():
     create_compressed_model_and_algo_for_test(ModelWithUserModule(), nncf_config)


+@pytest.mark.cuda
 def test_works_when_wrapped_with_dataparallel():
     if not torch.cuda.is_available():
         pytest.xfail("The executing host must have > 1 CUDA GPU in order for this test to be relevant.")
diff --git a/tests/torch/quantization/test_autoq_precision_init.py b/tests/torch/quantization/test_autoq_precision_init.py
index 084caf8f6d1..97b851d591f 100644
--- a/tests/torch/quantization/test_autoq_precision_init.py
+++ b/tests/torch/quantization/test_autoq_precision_init.py
@@ -137,6 +137,7 @@ def __str__(self):
 )


+@pytest.mark.cuda
 @pytest.mark.parametrize("params", AUTOQ_TEST_PARAMS, ids=[str(p) for p in AUTOQ_TEST_PARAMS])
 def test_autoq_precision_init(_seed, dataset_dir, tmp_path, mocker, params):
     config = params.config_builder.build()
diff --git a/tests/torch/quantization/test_functions.py b/tests/torch/quantization/test_functions.py
index b468f8c43d5..18ebd23afa6 100644
--- a/tests/torch/quantization/test_functions.py
+++ b/tests/torch/quantization/test_functions.py
@@ -614,12 +614,12 @@ class TestParametrizedLong(BaseParametrized):
     pass


-@pytest.mark.parametrize("device", ["cuda", "cpu"])
-def test_mapping_to_zero(quantization_mode, device):
+def test_mapping_to_zero(use_cuda, quantization_mode):
     torch.manual_seed(42)
-    if not torch.cuda.is_available() and device == "cuda":
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("Skipping CUDA test cases for CPU only setups")

+    device = "cuda" if use_cuda else "cpu"
     x_zero = torch.zeros([1]).to(torch.device(device))
     levels = 256
     eps = 1e-6
diff --git a/tests/torch/quantization/test_hawq_precision_init.py b/tests/torch/quantization/test_hawq_precision_init.py
index 440981a2360..c056ba96216 100644
--- a/tests/torch/quantization/test_hawq_precision_init.py
+++ b/tests/torch/quantization/test_hawq_precision_init.py
@@ -614,6 +614,7 @@ def precision_init_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
     torch.save(act_bitwidth_per_scope, str(out_file_path))


+@pytest.mark.cuda
 def test_can_broadcast_initialized_precisions_in_distributed_mode(tmp_path, runs_subprocess_in_precommit):
     if not torch.cuda.is_available():
         pytest.skip("Skipping CUDA test cases for CPU only setups")
diff --git a/tests/torch/sparsity/const/test_algo.py b/tests/torch/sparsity/const/test_algo.py
index baf61a4057d..ebc91d511ff 100644
--- a/tests/torch/sparsity/const/test_algo.py
+++ b/tests/torch/sparsity/const/test_algo.py
@@ -76,7 +76,9 @@ def test_can_restore_binary_mask_on_magnitude_algo_resume():
     PTTensorListComparator.check_equal(ref_mask_2, op.operand.binary_mask)


-@pytest.mark.parametrize("use_data_parallel", [True, False], ids=["dataparallel", "regular"])
+@pytest.mark.parametrize(
+    "use_data_parallel", [pytest.param(True, marks=pytest.mark.cuda), False], ids=["dataparallel", "regular"]
+)
 def test_can_restore_binary_mask_on_magnitude_quant_algo_resume(tmp_path, use_data_parallel):
     config = get_empty_config()
     config["compression"] = [
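[Editor's note] The per-parameter marking pattern in the hunk above is worth a short illustration: attaching `pytest.mark.cuda` to a single `pytest.param` keeps the CPU flavor in the default test scope while letting `-m` expressions select or drop only the GPU flavor. A minimal, self-contained sketch follows; the test name and assertion are hypothetical, not taken from this patch:

    # hypothetical_example.py -- illustration only, not part of this diff
    import pytest

    @pytest.mark.parametrize(
        "use_data_parallel",
        [pytest.param(True, marks=pytest.mark.cuda), False],
        ids=["dataparallel", "regular"],
    )
    def test_example(use_data_parallel):
        # `pytest -m "not cuda"` collects only the "regular" case;
        # `pytest -m cuda` collects only the "dataparallel" case.
        assert isinstance(use_data_parallel, bool)

This is why the patch converts plain `[True, False]` parametrizations into `pytest.param(...)` entries instead of marking whole test functions.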
diff --git a/tests/torch/sparsity/movement/test_components.py b/tests/torch/sparsity/movement/test_components.py
index 5db4292301f..13b75f20728 100644
--- a/tests/torch/sparsity/movement/test_components.py
+++ b/tests/torch/sparsity/movement/test_components.py
@@ -323,7 +323,6 @@ class TestFunctions:
         ],
     )
     @pytest.mark.parametrize("requires_grad", [True, False])
-    @pytest.mark.parametrize("use_cuda", [True, False])
     def test_binary_mask_by_threshold(
         self,
         input_tensor: torch.Tensor,
@@ -385,6 +384,7 @@ def test_importance_loss_forward(self, desc, requires_grad: bool, use_cuda: bool):
         assert output.requires_grad is requires_grad
         assert torch.allclose(output, torch.tensor(desc["ref_output"]))

+    @pytest.mark.cuda
     def test_importance_loss_adapts_to_device_change(self):
         if not torch.cuda.is_available():
             pytest.skip("requires GPU")
diff --git a/tests/torch/test_algo_common.py b/tests/torch/test_algo_common.py
index 8b7be3fc382..a7d64d851db 100644
--- a/tests/torch/test_algo_common.py
+++ b/tests/torch/test_algo_common.py
@@ -379,6 +379,7 @@ def get_basic_rb_sparsity_int8_config():
 ]


+@pytest.mark.cuda
 @pytest.mark.parametrize(
     "config",
     comp_loss_configs,
diff --git a/tests/torch/test_api_behavior.py b/tests/torch/test_api_behavior.py
index 9e18479e94e..e7cfe3a6618 100644
--- a/tests/torch/test_api_behavior.py
+++ b/tests/torch/test_api_behavior.py
@@ -122,7 +122,10 @@ def forward(self, x):
         return self.model.forward(x)


-@pytest.mark.parametrize("original_device", ["cpu", "cuda", "cuda:0"])
+@pytest.mark.parametrize(
+    "original_device",
+    ["cpu", pytest.param("cuda", marks=pytest.mark.cuda), pytest.param("cuda:0", marks=pytest.mark.cuda)],
+)
 def test_model_is_inited_with_own_device_by_default(nncf_config_with_default_init_args, original_device):
     if not torch.cuda.is_available() and "cuda" in original_device:
         pytest.skip("Skipping for CPU-only setups")
diff --git a/tests/torch/test_graph_building.py b/tests/torch/test_graph_building.py
index 01b6695045d..11ff4d984c1 100644
--- a/tests/torch/test_graph_building.py
+++ b/tests/torch/test_graph_building.py
@@ -374,10 +374,10 @@ def test_filler_input_info_arg_generation(filler_gen_test_struct: FillerInputInfo
     ],
     ids=["filler", "example", "loader"],
 )
-@pytest.mark.parametrize("device", ["cuda", "cpu"])
-def test_input_infos_respect_device_setting(input_info: ModelInputInfo, device: str):
-    if device == "cuda" and not torch.cuda.is_available():
+def test_input_infos_respect_device_setting(input_info: ModelInputInfo, use_cuda: bool):
+    if use_cuda and not torch.cuda.is_available():
         pytest.skip("Skipped checking CUDA device test cases on CPU-only hosts")
+    device = "cuda" if use_cuda else "cpu"
     forward_inputs = input_info.get_forward_inputs(device)

     def assert_on_device(x: torch.Tensor):
diff --git a/tests/torch/test_knowledge_distillation.py b/tests/torch/test_knowledge_distillation.py
index da4b4fdd471..b129c816e6e 100644
--- a/tests/torch/test_knowledge_distillation.py
+++ b/tests/torch/test_knowledge_distillation.py
@@ -62,7 +62,15 @@ def get_sparsity_config_with_sparsity_init(config: NNCFConfig, sparsity_init=0.5):
     return config


-@pytest.mark.parametrize("inference_type", ["cpu", "single_GPU", "DP", "DDP"])
+@pytest.mark.parametrize(
+    "inference_type",
+    [
+        "cpu",
+        pytest.param("single_GPU", marks=pytest.mark.cuda),
+        pytest.param("DP", marks=pytest.mark.cuda),
+        pytest.param("DDP", marks=pytest.mark.cuda),
+    ],
+)
 def test_knowledge_distillation_training_process(inference_type: str):
     if not torch.cuda.is_available() and inference_type != "cpu":
pytest.skip("Skipping CUDA test cases for CPU only setups") @@ -311,7 +319,15 @@ def test_kd_sparsity_statistics(algo: str): @pytest.mark.parametrize("device_placing", ["before", "after"]) -@pytest.mark.parametrize("inference_type", ["cpu", "single_GPU", "DP", "DDP"]) +@pytest.mark.parametrize( + "inference_type", + [ + "cpu", + pytest.param("single_GPU", marks=pytest.mark.cuda), + pytest.param("DP", marks=pytest.mark.cuda), + pytest.param("DDP", marks=pytest.mark.cuda), + ], +) def test_model_device_before_create_compressed_model(device_placing, inference_type): if not torch.cuda.is_available() and inference_type != "cpu": pytest.skip("Skipping CUDA test cases for CPU only setups") diff --git a/tests/torch/test_model_transformer.py b/tests/torch/test_model_transformer.py index c554a39ccb3..ce9c2c5e6e1 100644 --- a/tests/torch/test_model_transformer.py +++ b/tests/torch/test_model_transformer.py @@ -201,7 +201,7 @@ def to(self, device): self.to_device = device @pytest.mark.parametrize("target_point", available_points) - @pytest.mark.parametrize("multidevice", (False, True)) + @pytest.mark.parametrize("multidevice", (False, pytest.param(True, marks=pytest.mark.cuda))) @pytest.mark.parametrize("hook", (lambda x: x, BaseOpWithParam(lambda x: x).cpu())) def test_pt_insertion_command(self, target_point: PTTargetPoint, multidevice: bool, hook): model = wrap_model(InsertionPointTestModel(), torch.ones([1, 1, 10, 10])) @@ -696,7 +696,7 @@ def test_create_shared_quantizer_insertion_command(): "priority", [TransformationPriority.FP32_TENSOR_STATISTICS_OBSERVATION, TransformationPriority.DEFAULT_PRIORITY] ) @pytest.mark.parametrize("compression_module_registered", [False, True]) -@pytest.mark.parametrize("multidevice_model", (False, True)) +@pytest.mark.parametrize("multidevice_model", (False, pytest.param(True, marks=pytest.mark.cuda))) def test_shared_fn_insertion_point( priority, compression_module_registered, compression_module_type, multidevice_model, mocker ): @@ -786,7 +786,7 @@ def _insert_external_op_mocked(): "priority", [TransformationPriority.FP32_TENSOR_STATISTICS_OBSERVATION, TransformationPriority.DEFAULT_PRIORITY] ) @pytest.mark.parametrize("compression_module_registered", [False, True]) -@pytest.mark.parametrize("multidevice_model", (False, True)) +@pytest.mark.parametrize("multidevice_model", (False, pytest.param(True, marks=pytest.mark.cuda))) def test_shared_fn_insertion_command_several_module_types( priority, compression_module_registered, multidevice_model, mocker ): diff --git a/tests/torch/test_nncf_network.py b/tests/torch/test_nncf_network.py index c4da4be8c82..77a03f4ebf6 100644 --- a/tests/torch/test_nncf_network.py +++ b/tests/torch/test_nncf_network.py @@ -835,6 +835,7 @@ def forward(self, x, y): return res +@pytest.mark.cuda def test_multidevice_model(): if not torch.cuda.is_available(): pytest.skip("GPU required")