From 20e7f11dd7f349f330c9d79c5e4b2606fcd42216 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:17:25 +0200 Subject: [PATCH] Refactor `test_random` to minimize collective calls (#1677) * debugging * fix misinterpretation of dtype * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * replace numpy() calls with alternative checks * debugging * debugging * debugging randint * debugging * cast ints to float in statistical ops * bypass numpy call l. 197 * bypass more numpy calls, skip median checks * bypass more numpy calls, skip median checks * bypass numpy calls wherever possible * reinstate median checks * skip ht.median if split>0 * skip all ht.median * Revert "skip all ht.median" This reverts commit 12414542a33365ac32929067e962c523eb45fd19. * Revert "skip ht.median if split>0" This reverts commit 4da8c930314a2a049045f6c3de62bf88159cd976. * Revert "reinstate median checks" This reverts commit bf509149740b9257eaa1e44900a5cca6b791ada5. (cherry picked from commit 4b3e5704dd5270f7c9652c702a18af07c8a39d87) --- heat/core/statistics.py | 16 ++- heat/core/tests/test_random.py | 179 +++++++++++++------------------ heat/utils/data/matrixgallery.py | 2 +- 3 files changed, 93 insertions(+), 104 deletions(-) diff --git a/heat/core/statistics.py b/heat/core/statistics.py index 0dd1e4dcc0..29c557863d 100644 --- a/heat/core/statistics.py +++ b/heat/core/statistics.py @@ -982,11 +982,18 @@ def reduce_means_elementwise(output_shape_i: torch.Tensor) -> DNDarray: return mu_tot[0][0] if mu_tot[0].size == 1 else mu_tot[0] # ---------------------------------------------------------------------------------------------- + # sanitize dtype + if types.heat_type_is_exact(x.dtype): + if x.dtype is types.int64: + x = x.astype(types.float64) + else: + x = x.astype(types.float32) + if axis is None: # full matrix calculation if not x.is_distributed(): # if x is not distributed do a torch.mean on x - ret = torch.mean(x.larray.float()) + ret = torch.mean(x.larray) return DNDarray( ret, gshape=tuple(ret.shape), @@ -1791,6 +1798,13 @@ def std( >>> ht.std(a, 1) DNDarray([1.2961, 0.3362, 1.0739, 0.9820], dtype=ht.float32, device=cpu:0, split=None) """ + # sanitize dtype + if types.heat_type_is_exact(x.dtype): + if x.dtype is types.int64: + x = x.astype(types.float64) + else: + x = x.astype(types.float32) + if not isinstance(ddof, int): raise TypeError(f"ddof must be integer, is {type(ddof)}") # elif ddof > 1: diff --git a/heat/core/tests/test_random.py b/heat/core/tests/test_random.py index 0d6bfd904b..2fc5072354 100644 --- a/heat/core/tests/test_random.py +++ b/heat/core/tests/test_random.py @@ -153,16 +153,16 @@ def test_rand(self): a = ht.random.rand(21, 16, 17, 21, dtype=ht.float32, split=2) b = ht.random.rand(15, 11, 19, 31, dtype=ht.float32, split=0) - a = a.numpy().flatten() - b = b.numpy().flatten() - c = np.concatenate((a, b)) + a = a.flatten() + b = b.flatten() + c = ht.concatenate((a, b)) # Values should be spread evenly across the range [0, 1) - mean = np.mean(c) - median = np.median(c) - std = np.std(c) + mean = ht.mean(c) + # median = np.median(c) + std = ht.std(c) self.assertTrue(0.49 < mean < 0.51) - self.assertTrue(0.49 < median < 0.51) + # self.assertTrue(0.49 < median < 0.51) self.assertTrue(std < 0.3) self.assertTrue(((0 <= c) & (c < 1)).all()) @@ -170,11 +170,9 @@ def test_randint(self): # Checked that the random values are in the correct range a = ht.random.randint(low=0, high=10, size=(10, 10), dtype=ht.int64) self.assertEqual(a.dtype, ht.int64) - a = a.numpy() self.assertTrue(((0 <= a) & (a < 10)).all()) a = ht.random.randint(low=100000, high=150000, size=(31, 25, 11), dtype=ht.int64, split=2) - a = a.numpy() self.assertTrue(((100000 <= a) & (a < 150000)).all()) # For the range [0, 1) only the value 0 is allowed @@ -194,20 +192,18 @@ def test_randint(self): shape = (15, 13, 9, 21, 65) ht.random.seed(13579) a = ht.random.randint(10000, size=shape, split=2, dtype=ht.int64) - a = a.numpy() ht.random.seed(13579) b = ht.random.randint(low=0, high=10000, size=shape, split=2, dtype=ht.int64) - b = b.numpy() - self.assertTrue(np.array_equal(a, b)) - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + self.assertTrue(ht.equal(a, b)) + mean = ht.mean(a) + # median = ht.median(a) + std = ht.std(a) # Mean and median should be in the center while the std is very high due to an even distribution self.assertTrue(4900 < mean < 5100) - self.assertTrue(4900 < median < 5100) + # self.assertTrue(4900 < median < 5100) self.assertTrue(std < 2900) with self.assertRaises(ValueError): @@ -226,31 +222,26 @@ def test_randint(self): self.assertEqual(a.dtype, ht.int32) self.assertEqual(a.larray.dtype, torch.int32) self.assertEqual(b.dtype, ht.int32) - a = a.numpy() - b = b.numpy() - self.assertEqual(a.dtype, np.int32) - self.assertTrue(np.array_equal(a, b)) + self.assertTrue(ht.equal(a, b)) self.assertTrue(((50 <= a) & (a < 1000)).all()) self.assertTrue(((50 <= b) & (b < 1000)).all()) c = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0) - c = c.numpy() - self.assertFalse(np.array_equal(a, c)) - self.assertFalse(np.array_equal(b, c)) + self.assertFalse(ht.equal(a, c)) + self.assertFalse(ht.equal(b, c)) self.assertTrue(((50 <= c) & (c < 1000)).all()) ht.random.seed(0xFFFFFFF) a = ht.random.randint( 10000, size=(123, 42, 13, 21), split=3, dtype=ht.int32, comm=ht.MPI_WORLD ) - a = a.numpy() - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + mean = ht.mean(a) + # median = np.median(a) + std = ht.std(a) # Mean and median should be in the center while the std is very high due to an even distribution self.assertTrue(4900 < mean < 5100) - self.assertTrue(4900 < median < 5100) + # self.assertTrue(4900 < median < 5100) self.assertTrue(std < 2900) # test aliases @@ -297,23 +288,21 @@ def test_randn(self): a = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) self.assertEqual(a.dtype, ht.float32) self.assertEqual(a.larray[0, 0, 0].dtype, torch.float32) - a = a.numpy() - self.assertEqual(a.dtype, np.float32) - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + mean = ht.mean(a) + # median = np.median(a) + std = ht.std(a) self.assertTrue(-0.02 < mean < 0.02) - self.assertTrue(-0.02 < median < 0.02) + # self.assertTrue(-0.02 < median < 0.02) self.assertTrue(0.99 < std < 1.01) ls = 272 + ht.MPI_WORLD.rank ht.random.set_state(("Batchparallel", None, ls)) - b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy() - self.assertTrue(np.allclose(a, b)) + b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) + self.assertTrue(ht.allclose(a, b)) - c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy() - self.assertFalse(np.allclose(a, c)) - self.assertFalse(np.allclose(b, c)) + c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) + self.assertFalse(ht.allclose(a, c)) + self.assertFalse(ht.allclose(b, c)) # check wrong shapes with self.assertRaises(ValueError): @@ -539,10 +528,9 @@ def test_rand(self): a = ht.random.rand(2, 3, 4, 5, split=0) ht.random.set_state(("Threefry", seed, 0x10000000000000000)) b = ht.random.rand(2, 44, split=0) - a = a.numpy().flatten() - b = b.numpy().flatten() - self.assertEqual(a.dtype, np.float32) - self.assertTrue(np.array_equal(a[32:], b)) + a = a.flatten() + b = b.flatten() + self.assertTrue(ht.equal(a[32:], b)) # Check that random numbers don't repeat after first overflow seed = 12345 @@ -557,9 +545,9 @@ def test_rand(self): a = ht.random.rand(2, 34, split=0) ht.random.set_state(("Threefry", seed, 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0)) b = ht.random.rand(2, 50, split=0) - a = a.numpy().flatten() - b = b.numpy().flatten() - self.assertTrue(np.array_equal(a, b[32:])) + a = a.flatten() + b = b.flatten() + self.assertTrue(ht.equal(a, b[32:])) # different split axis with resetting seed ht.random.seed(seed) @@ -573,9 +561,9 @@ def test_rand(self): a = ht.random.rand(2, 50, split=0) ht.random.seed(seed) b = ht.random.rand(100, split=None) - a = a.numpy().flatten() - b = b.larray.cpu().numpy() - self.assertTrue(np.array_equal(a, b)) + a = a.flatten() + b = ht.resplit(b, 0) + self.assertTrue(ht.equal(a, b)) # On different shape and split the same random values are used ht.random.seed(seed) @@ -632,37 +620,36 @@ def test_rand(self): ht.random.seed(9876) b = ht.random.rand(np.prod(shape), dtype=ht.float32) - a = a.numpy().flatten() - b = b.larray.cpu().numpy() - self.assertTrue(np.array_equal(a, b)) - self.assertEqual(a.dtype, np.float32) + a = a.flatten() + b = ht.resplit(b, 0) + self.assertTrue(ht.equal(a, b)) a = ht.random.rand(21, 16, 17, 21, dtype=ht.float32, split=2) b = ht.random.rand(15, 11, 19, 31, dtype=ht.float32, split=0) - a = a.numpy().flatten() - b = b.numpy().flatten() - c = np.concatenate((a, b)) + a = a.flatten() + b = b.flatten() + c = ht.concatenate((a, b)) # Values should be spread evenly across the range [0, 1) - mean = np.mean(c) - median = np.median(c) - std = np.std(c) + mean = ht.mean(c) + # median = np.median(c) + std = ht.std(c) self.assertTrue(0.49 < mean < 0.51) - self.assertTrue(0.49 < median < 0.51) + # self.assertTrue(0.49 < median < 0.51) self.assertTrue(std < 0.3) self.assertTrue(((0 <= c) & (c < 1)).all()) ht.random.seed(11111) - a = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy() + a = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32) # Overflow reached ht.random.set_state(("Threefry", 11111, 0x10000000000000000)) - b = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy() - self.assertTrue(np.array_equal(a, b)) + b = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32) + self.assertTrue(ht.equal(a, b)) ht.random.set_state(("Threefry", 11111, 0x100000000)) - c = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy() - self.assertFalse(np.array_equal(a, c)) - self.assertFalse(np.array_equal(b, c)) + c = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32) + self.assertFalse(ht.equal(a, c)) + self.assertFalse(ht.equal(b, c)) # To check working with large number of elements ht.random.randn(6667, 3523, dtype=ht.float64, split=None) @@ -675,11 +662,9 @@ def test_randint(self): # Checked that the random values are in the correct range a = ht.random.randint(low=0, high=10, size=(10, 10), dtype=ht.int64) self.assertEqual(a.dtype, ht.int64) - a = a.numpy() self.assertTrue(((0 <= a) & (a < 10)).all()) a = ht.random.randint(low=100000, high=150000, size=(31, 25, 11), dtype=ht.int64, split=2) - a = a.numpy() self.assertTrue(((100000 <= a) & (a < 150000)).all()) # For the range [0, 1) only the value 0 is allowed @@ -699,35 +684,32 @@ def test_randint(self): ht.random.seed(13579) shape = (15, 13, 9, 21, 65) a = ht.random.randint(15, 100, size=shape, split=0, dtype=ht.int64) - a = a.numpy().flatten() + a = a.flatten() ht.random.seed(13579) elements = np.prod(shape) b = ht.random.randint(low=15, high=100, size=(elements,), dtype=ht.int64) - b = b.numpy() - self.assertTrue(np.array_equal(a, b)) + self.assertTrue(ht.equal(a, b)) # Two arrays with the same seed and shape have identical values ht.random.seed(13579) a = ht.random.randint(10000, size=shape, split=2, dtype=ht.int64) - a = a.numpy() ht.random.seed(13579) b = ht.random.randint(low=0, high=10000, size=shape, split=2, dtype=ht.int64) - b = b.numpy() ht.random.seed(13579) c = ht.random.randint(low=0, high=10000, dtype=ht.int64) - self.assertTrue(np.equal(b[0, 0, 0, 0, 0], c)) + self.assertTrue(ht.equal(b[0, 0, 0, 0, 0], c)) - self.assertTrue(np.array_equal(a, b)) - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + self.assertTrue(ht.equal(a, b)) + mean = ht.mean(a) + # median = np.median(a) + std = ht.std(a) # Mean and median should be in the center while the std is very high due to an even distribution self.assertTrue(4900 < mean < 5100) - self.assertTrue(4900 < median < 5100) + # self.assertTrue(4900 < median < 5100) self.assertTrue(std < 2900) with self.assertRaises(ValueError): @@ -746,31 +728,26 @@ def test_randint(self): self.assertEqual(a.dtype, ht.int32) self.assertEqual(a.larray.dtype, torch.int32) self.assertEqual(b.dtype, ht.int32) - a = a.numpy() - b = b.numpy() - self.assertEqual(a.dtype, np.int32) - self.assertTrue(np.array_equal(a, b)) + self.assertTrue(ht.equal(a, b)) self.assertTrue(((50 <= a) & (a < 1000)).all()) self.assertTrue(((50 <= b) & (b < 1000)).all()) c = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0) - c = c.numpy() - self.assertFalse(np.array_equal(a, c)) - self.assertFalse(np.array_equal(b, c)) + self.assertFalse(ht.equal(a, c)) + self.assertFalse(ht.equal(b, c)) self.assertTrue(((50 <= c) & (c < 1000)).all()) ht.random.seed(0xFFFFFFF) a = ht.random.randint( 10000, size=(123, 42, 13, 21), split=3, dtype=ht.int32, comm=ht.MPI_WORLD ) - a = a.numpy() - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + mean = ht.mean(a) + # median = np.median(a) + std = ht.std(a) # Mean and median should be in the center while the std is very high due to an even distribution self.assertTrue(4900 < mean < 5100) - self.assertTrue(4900 < median < 5100) + # self.assertTrue(4900 < median < 5100) self.assertTrue(std < 2900) # test aliases @@ -826,22 +803,20 @@ def test_randn(self): a = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) self.assertEqual(a.dtype, ht.float32) self.assertEqual(a.larray[0, 0, 0].dtype, torch.float32) - a = a.numpy() - self.assertEqual(a.dtype, np.float32) - mean = np.mean(a) - median = np.median(a) - std = np.std(a) + mean = ht.mean(a) + # median = np.median(a) + std = ht.std(a) self.assertTrue(-0.01 < mean < 0.01) - self.assertTrue(-0.01 < median < 0.01) + # self.assertTrue(-0.01 < median < 0.01) self.assertTrue(0.99 < std < 1.01) ht.random.set_state(("Threefry", 54321, 0x10000000000000000)) - b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy() - self.assertTrue(np.allclose(a, b)) + b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) + self.assertTrue(ht.allclose(a, b)) - c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy() - self.assertFalse(np.allclose(a, c)) - self.assertFalse(np.allclose(b, c)) + c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2) + self.assertFalse(ht.allclose(a, c)) + self.assertFalse(ht.allclose(b, c)) def test_randperm(self): ht.random.set_state(("Threefry", 0, 0)) diff --git a/heat/utils/data/matrixgallery.py b/heat/utils/data/matrixgallery.py index c94d283141..47b6c0b19c 100644 --- a/heat/utils/data/matrixgallery.py +++ b/heat/utils/data/matrixgallery.py @@ -61,7 +61,7 @@ def hermitian( matrix = randn(n, n, dtype=real_dtype, split=split, device=device, comm=comm) + 1j * randn( n, n, dtype=real_dtype, split=split, device=device, comm=comm ) - elif not heat_type_is_exact(dtype): + elif dtype in [core.float32, core.float64]: matrix = randn(n, n, dtype=dtype, split=split, device=device, comm=comm) else: raise ValueError("dtype must be floating-point data-type but is ", dtype, ".")