diff --git a/chroma_core/models/client_mount.py b/chroma_core/models/client_mount.py
index 5772f6e241..183f7ddb82 100644
--- a/chroma_core/models/client_mount.py
+++ b/chroma_core/models/client_mount.py
@@ -42,6 +42,10 @@ def get_deps(self, state=None):
             state = self.state
 
         deps = []
+
+        if self.host.immutable_state:
+            return DependAll(deps)
+
         if state == "mounted":
             # Depend on this mount's host having LNet up. If LNet is stopped
             # on the host, this filesystem will be unmounted first.
diff --git a/chroma_core/models/host.py b/chroma_core/models/host.py
index 9c6391e6af..86975271ff 100644
--- a/chroma_core/models/host.py
+++ b/chroma_core/models/host.py
@@ -797,7 +797,7 @@ class BaseSetupHostJob(NullStateChangeJob):
     class Meta:
         abstract = True
 
-    def _common_deps(self, lnet_state_required, lnet_acceptable_states, lnet_unacceptable_states):
+    def _common_deps(self):
         # It really does not feel right that this is in here, but it does sort of work. These are the things
         # it is dependent on so create them. Also I can't work out with today's state machine anywhere else to
         # put them that works.
@@ -826,23 +826,13 @@ def _common_deps(self, lnet_state_required, lnet_acceptable_states, lnet_unaccep
 
         deps = []
 
-        if self.target_object.lnet_configuration:
-            deps.append(
-                DependOn(
-                    self.target_object.lnet_configuration,
-                    lnet_state_required,
-                    lnet_acceptable_states,
-                    lnet_unacceptable_states,
-                )
-            )
-
         if self.target_object.pacemaker_configuration:
             deps.append(DependOn(self.target_object.pacemaker_configuration, "started"))
 
         if self.target_object.ntp_configuration:
             deps.append(DependOn(self.target_object.ntp_configuration, "configured"))
 
-        return DependAll(deps)
+        return deps
 
 
 class InitialiseBlockDeviceDriversStep(Step):
@@ -871,7 +861,17 @@ def description(self):
         return help_text["setup_managed_host_on"] % self.target_object
 
     def get_deps(self):
-        return self._common_deps("lnet_up", None, None)
+        deps = self._common_deps()
+
+        if self.target_object.lnet_configuration:
+            deps.append(
+                DependOn(
+                    self.target_object.lnet_configuration,
+                    "lnet_up"
+                )
+            )
+
+        return DependAll(deps)
 
     def get_steps(self):
         return [(InitialiseBlockDeviceDriversStep, {"host": self.target_object})]
@@ -891,9 +891,9 @@ class Meta:
         ordering = ["id"]
 
     def get_deps(self):
-        # Moving out of unconfigured into lnet_unloaded will mean that lnet will start monitoring and responding to
-        # the state. Once we start monitoring any state other than unconfigured is acceptable.
-        return self._common_deps("lnet_unloaded", None, ["unconfigured"])
+        deps = self._common_deps()
+
+        return DependAll(deps)
 
     def description(self):
         return help_text["setup_monitored_host_on"] % self.target_object
@@ -913,14 +913,24 @@ class Meta:
         ordering = ["id"]
 
     def get_deps(self):
-        return self._common_deps("lnet_up", None, None)
+        deps = self._common_deps()
+
+        if self.target_object.lnet_configuration and not self.target_object.immutable_state:
+            deps.append(
+                DependOn(
+                    self.target_object.lnet_configuration,
+                    "lnet_up"
+                )
+            )
+
+        return DependAll(deps)
 
     def description(self):
         return help_text["setup_worker_host_on"] % self.target_object
 
     @classmethod
     def can_run(cls, host):
-        return host.is_managed and host.is_worker and (host.state != "unconfigured")
+        return host.is_worker and (host.state != "unconfigured")
 
 
 class DetectTargetsStep(Step):
@@ -1174,6 +1184,9 @@ def description(self):
     def get_deps(self):
         deps = []
 
+        if self.host.immutable_state:
+            return DependAll(deps)
+
         if self.host.lnet_configuration:
             deps.append(DependOn(self.host.lnet_configuration, "unconfigured"))
 
diff --git a/chroma_core/models/jobs.py b/chroma_core/models/jobs.py
index 6698ca6208..02369155af 100644
--- a/chroma_core/models/jobs.py
+++ b/chroma_core/models/jobs.py
@@ -428,7 +428,6 @@ def all_deps(self, dep_cache):
                 DependAll(dependent_deps),
                 dep_cache.get(self),
                 dep_cache.get(stateful_object, new_state),
-                DependOn(stateful_object, self.old_state),
             )
         else:
             return dep_cache.get(self)
diff --git a/chroma_core/services/job_scheduler/job_scheduler.py b/chroma_core/services/job_scheduler/job_scheduler.py
index 2d22c2037f..5def84ba39 100644
--- a/chroma_core/services/job_scheduler/job_scheduler.py
+++ b/chroma_core/services/job_scheduler/job_scheduler.py
@@ -1106,14 +1106,17 @@ def key(td):
         # if we have an entry with 'root'=true then move it to the front of the list before returning the result
         return sorted(sorted_list, key=lambda entry: entry.get("root", False), reverse=True)
 
-    def create_client_mount(self, host_id, filesystem_name, mountpoint):
+    def create_client_mount(self, host_id, filesystem_id, mountpoint, existing):
         # RPC-callable
         host = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == host_id)
-        mount = self._create_client_mount(host, filesystem_name, mountpoint)
+        filesystem = ObjectCache.get_one(ManagedFilesystem, lambda mf: mf.id == filesystem_id)
+
+        mount = self._create_client_mount(host, filesystem, mountpoint, existing)
+
         self.progress.advance()
         return mount.id
 
-    def _create_client_mount(self, host, filesystem_name, mountpoint):
+    def _create_client_mount(self, host, filesystem, mountpoint, existing=False):
         # Used for intra-JobScheduler calls
-        log.debug("Creating client mount for %s as %s:%s" % (filesystem_name, host, mountpoint))
+        log.debug("Creating client mount for %s as %s:%s" % (filesystem.name, host, mountpoint))
 
@@ -1121,7 +1124,10 @@ def _create_client_mount(self, host, filesystem_name, mountpoint):
         from django.db import transaction
 
         with transaction.atomic():
-            mount, created = LustreClientMount.objects.get_or_create(host=host, filesystem=filesystem_name)
+            mount, created = LustreClientMount.objects.get_or_create(host=host, filesystem=filesystem)
+            if existing:
+                mount.state = "mounted"
+
             mount.mountpoint = mountpoint
             mount.save()
 
diff --git a/chroma_core/services/job_scheduler/job_scheduler_client.py b/chroma_core/services/job_scheduler/job_scheduler_client.py
index 8091f3f986..0803ed0717 100644
--- a/chroma_core/services/job_scheduler/job_scheduler_client.py
+++ b/chroma_core/services/job_scheduler/job_scheduler_client.py
@@ -242,10 +242,10 @@ def create_targets(cls, targets_data):
         return (list(ManagedTarget.objects.filter(id__in=target_ids)), Command.objects.get(pk=command_id))
 
     @classmethod
-    def create_client_mount(cls, host, filesystem_name, mountpoint):
+    def create_client_mount(cls, host, filesystem, mountpoint, existing):
         from chroma_core.models import LustreClientMount
 
-        client_mount_id = JobSchedulerRpc().create_client_mount(host.id, filesystem_name, mountpoint)
+        client_mount_id = JobSchedulerRpc().create_client_mount(host.id, filesystem.id, mountpoint, existing)
         return LustreClientMount.objects.get(id=client_mount_id)
 
     @classmethod
diff --git a/chroma_core/services/lustre_audit/update_scan.py b/chroma_core/services/lustre_audit/update_scan.py
index a0f375f101..5b880bc446 100755
--- a/chroma_core/services/lustre_audit/update_scan.py
+++ b/chroma_core/services/lustre_audit/update_scan.py
@@ -162,7 +162,8 @@ def update_client_mounts(self):
                     log.info("updated mount %s on %s -> active" % (actual_mount["mountpoint"], self.host))
                 except IndexError:
                     log.info("creating new mount %s on %s" % (actual_mount["mountpoint"], self.host))
-                    JobSchedulerClient.create_client_mount(self.host, fsname, actual_mount["mountpoint"])
+                    filesystem = ManagedFilesystem.objects.get(name=fsname)
+                    JobSchedulerClient.create_client_mount(self.host, filesystem, actual_mount["mountpoint"], True)
 
     def update_target_mounts(self):
         # If mounts is None then nothing changed since the last update and so we can just return.
diff --git a/chroma_core/services/plugin_runner/resource_manager.py b/chroma_core/services/plugin_runner/resource_manager.py
index c4015ac436..608a4f5bd0 100644
--- a/chroma_core/services/plugin_runner/resource_manager.py
+++ b/chroma_core/services/plugin_runner/resource_manager.py
@@ -901,23 +901,19 @@ def _persist_nid_updates(self, scannable_id, changed_resource_id, changed_attrs)
         for lnet_state_resource in node_resources[LNETModules]:
             lnet_state = lnet_state_resource.to_resource()
 
-            # Really this code should be more tightly tied to the lnet_configuration classes, but in a one step
-            # at a time approach. Until lnet is !unconfigured we should not be updating it's state.
-            # Double if because the first if should be removed really, in some more perfect future.
-            if host.lnet_configuration.state != "unconfigured":
-                if lnet_state.host_id == host.id:
-                    lnet_configuration = LNetConfiguration.objects.get(host=lnet_state.host_id)
-
-                    # Truthfully this should use the notify which I've added as a comment to show the correct way. The problem is that
-                    # during the ConfigureLNetJob the state is changed to unloaded and this masks the notify in some way the is probably
-                    # as planned but prevents it being set back. What we really need is to somehow get a single command that goes
-                    # to a state and then to another state. post_dependencies almost. At present I can't see how to do this so I am leaving
-                    # this code as is.
-                    lnet_configuration.set_state(lnet_state.state)
-                    lnet_configuration.save()
-                    # JobSchedulerClient.notify(lnet_configuration, now(), {'state': lnet_state.state})
-
-                    log.debug("_persist_nid_updates lnet_configuration %s" % lnet_configuration)
+            if lnet_state.host_id == host.id:
+                lnet_configuration = LNetConfiguration.objects.get(host=lnet_state.host_id)
+
+                # Truthfully this should use the notify which I've added as a comment to show the correct way. The problem is that
+                # during the ConfigureLNetJob the state is changed to unloaded and this masks the notify in some way that is probably
+                # as planned but prevents it being set back. What we really need is to somehow get a single command that goes
+                # to a state and then to another state. post_dependencies almost. At present I can't see how to do this so I am leaving
+                # this code as is.
+                lnet_configuration.set_state(lnet_state.state)
+                lnet_configuration.save()
+                # JobSchedulerClient.notify(lnet_configuration, now(), {'state': lnet_state.state})
+
+                log.debug("_persist_nid_updates lnet_configuration %s" % lnet_configuration)
 
         # Only get the lnet_configuration if we actually have a LNetInterface (nid) to add.
         if len(node_resources[LNETInterface]) > 0:
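
Reviewer note on the dependency refactor: `_common_deps()` previously took the required
LNet state and returned a `DependAll`; it now returns a bare list of the shared
pacemaker/ntp dependencies, and each setup job appends its own LNet dependency (or none,
for immutable hosts) before wrapping the result. A minimal sketch of the resulting
pattern, using stand-in classes rather than the real chroma_core types:

    # Stand-ins for chroma_core's DependOn/DependAll, shape only.
    class DependOn(object):
        def __init__(self, stateful_object, preferred_state):
            self.stateful_object = stateful_object
            self.preferred_state = preferred_state

    class DependAll(object):
        def __init__(self, deps):
            self.deps = list(deps)

    def common_deps(host):
        # Shared dependencies only; LNet is no longer handled here.
        deps = []
        if host.pacemaker_configuration:
            deps.append(DependOn(host.pacemaker_configuration, "started"))
        if host.ntp_configuration:
            deps.append(DependOn(host.ntp_configuration, "configured"))
        return deps

    def setup_job_deps(host):
        deps = common_deps(host)
        # Setup jobs that drive LNet require it up, but skip the dependency
        # for immutable (monitor-only) hosts, whose state the manager must
        # not try to change.
        if host.lnet_configuration and not host.immutable_state:
            deps.append(DependOn(host.lnet_configuration, "lnet_up"))
        return DependAll(deps)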
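
Call-site note: `JobSchedulerClient.create_client_mount` now takes a `ManagedFilesystem`
and an `existing` flag in place of a bare filesystem name, and only the filesystem id
crosses the RPC boundary. A hedged example of the two call shapes (`fsname` and
`mountpoint` stand for whatever the caller already has; the update_scan.py hunk above
is the `existing=True` case):

    filesystem = ManagedFilesystem.objects.get(name=fsname)

    # Audit discovered a mount that is already active on the client, so the new
    # LustreClientMount record is put straight into the "mounted" state.
    JobSchedulerClient.create_client_mount(host, filesystem, mountpoint, True)

    # Manager-driven mount creation: the record keeps its default initial state
    # and is mounted later through the normal state transitions.
    JobSchedulerClient.create_client_mount(host, filesystem, mountpoint, False)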