From ec9c4c4cd61576630b5c2803f3b68d9070fa3c2a Mon Sep 17 00:00:00 2001 From: Scott Stevenson Date: Tue, 3 Dec 2024 22:19:49 +0000 Subject: [PATCH] Set `epoch_seed_change` attribute on `SimulationDataset` The `epoch_seed_change` attribute was added to `StreamingDataset`, which `SimulationDataset` inherits from, so it also needs to be set here. Without it, running a simulation fails when the inherited code accesses the missing attribute: ``` AttributeError: 'SimulationDataset' object has no attribute 'epoch_seed_change' Traceback: File "/home/scott/projects/streaming/.venv/lib64/python3.12/site-packages/streamlit/runtime/scriptrunner/exec_code.py", line 88, in exec_func_with_error_handling result = func() ^^^^^^ File "/home/scott/projects/streaming/.venv/lib64/python3.12/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 579, in code_to_exec exec(code, module.__dict__) File "/home/scott/projects/streaming/simulation/interfaces/sim_ui.py", line 409, in submit_jobs(shuffle_quality, dataset, time_per_sample, node_internet_bandwidth, File "/home/scott/projects/streaming/simulation/interfaces/sim_ui.py", line 110, in submit_jobs for output in gen_sim: ^^^^^^^ File "/home/scott/projects/streaming/simulation/core/main.py", line 110, in simulate samples_per_node = dataset.get_samples_per_node(epoch, 0) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/simulation/core/sim_dataset.py", line 367, in get_samples_per_node partition = generate_work(self.batching_method, self, self.world, epoch, sample_in_epoch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/batching/__init__.py", line 45, in generate_work return get(dataset, world, epoch, sample_in_epoch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/batching/random.py", line 49, in generate_work_random_batching shuffle_units, small_per_big = dataset.resample_streams(epoch) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/dataset.py", line 878, in resample_streams epoch, self.epoch_seed_change) ^^^^^^^^^^^^^^^^^^^^^^ ``` Closes https://github.com/mosaicml/streaming/issues/831 --- simulation/core/sim_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/simulation/core/sim_dataset.py b/simulation/core/sim_dataset.py index fe1036ea3..f093106df 100644 --- a/simulation/core/sim_dataset.py +++ b/simulation/core/sim_dataset.py @@ -201,6 +201,9 @@ def __init__(self, if epoch_size_value < 0: raise ValueError(f'Epoch size cannot be negative. Received {epoch_size_value}.') + # Determine if we should be changing the seed every epoch + self.epoch_seed_change = self.shuffle and self.sampling_method == 'balanced' + # Initialize the Stream defaults and normalize to a list of Streams. if streams: default = {