From 68813b762309677d9ef0c157c529a4b650533976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20S=C3=A1nchez-Gallego?= Date: Mon, 30 Dec 2024 10:27:43 -0800 Subject: [PATCH 1/3] Monitor internet and LCO connectivity --- src/gort/etc/lvmgort.yml | 2 ++ src/gort/overwatcher/alerts.py | 25 +++++++++++++-- src/gort/overwatcher/overwatcher.py | 49 ++++++++++++++++------------- src/gort/tools.py | 4 +-- 4 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/gort/etc/lvmgort.yml b/src/gort/etc/lvmgort.yml index 9919a50..a9143fa 100644 --- a/src/gort/etc/lvmgort.yml +++ b/src/gort/etc/lvmgort.yml @@ -282,6 +282,8 @@ services: notifications: gortdb.notification overwatcher: + lock_timeout_on_unsafe: 1800 + scheduler: open_dome_buffer: 300 diff --git a/src/gort/overwatcher/alerts.py b/src/gort/overwatcher/alerts.py index a389247..4c06191 100644 --- a/src/gort/overwatcher/alerts.py +++ b/src/gort/overwatcher/alerts.py @@ -42,6 +42,8 @@ class AlertsSummary(BaseModel): o2_room_alerts: dict[str, bool] | None = None heater_alert: bool | None = None heater_camera_alerts: dict[str, bool] | None = None + lco_connectivity: bool | None = None + internet_connectivity: bool | None = None class ActiveAlert(enum.Flag): @@ -56,6 +58,7 @@ class ActiveAlert(enum.Flag): O2 = enum.auto() LOCKED = enum.auto() UNAVAILABLE = enum.auto() + DISCONNECTED = enum.auto() UNKNOWN = enum.auto() @@ -166,6 +169,16 @@ def is_safe(self) -> tuple[bool, ActiveAlert]: active_alerts |= ActiveAlert.WIND is_safe = False + if not self.state.internet_connectivity: + self.log.warning("Internet connectivity lost.") + active_alerts |= ActiveAlert.DISCONNECTED + is_safe = False + + if not self.state.lco_connectivity: + self.log.warning("Internal LCO connectivity lost.") + active_alerts |= ActiveAlert.DISCONNECTED + is_safe = False + # These alerts are not critical but we log them. # TODO: maybe we do want to do something about these alerts. 
if self.state.door_alert: @@ -183,7 +196,8 @@ def is_safe(self) -> tuple[bool, ActiveAlert]: # and put a lock for 30 minutes to prevent the dome from opening/closing too # frequently if the weather is unstable. if self.locked_until > 0 and time() < self.locked_until: - return False, active_alerts | ActiveAlert.LOCKED + is_safe = False + active_alerts |= ActiveAlert.LOCKED if is_safe: self.locked_until = 0 @@ -194,6 +208,11 @@ def is_safe(self) -> tuple[bool, ActiveAlert]: async def get_alerts_summary(self) -> AlertsSummary: """Returns the alerts report.""" - data = await get_lvmapi_route("/alerts/summary") + alerts_data = await get_lvmapi_route("/alerts/summary") + summary = AlertsSummary(**alerts_data) + + connectivity_data = await get_lvmapi_route("/alerts/connectivity") + summary.lco_connectivity = connectivity_data["lco_connectivity"] + summary.internet_connectivity = connectivity_data["internet"] - return AlertsSummary(**data) + return summary diff --git a/src/gort/overwatcher/overwatcher.py b/src/gort/overwatcher/overwatcher.py index 9e4298c..24526ff 100644 --- a/src/gort/overwatcher/overwatcher.py +++ b/src/gort/overwatcher/overwatcher.py @@ -129,71 +129,76 @@ async def task(self): async def handle_unsafe(self): """Closes the dome if the conditions are unsafe.""" - closed = await self.overwatcher.dome.is_closing() + ow = self.overwatcher - observing = self.overwatcher.observer.is_observing - cancelling = self.overwatcher.observer.is_cancelling - calibrating = self.overwatcher.state.calibrating + closed = await ow.dome.is_closing() + + observing = ow.observer.is_observing + cancelling = ow.observer.is_cancelling + calibrating = ow.state.calibrating - _, alerts_status = self.overwatcher.alerts.is_safe() + _, alerts_status = ow.alerts.is_safe() is_raining = bool(alerts_status & ActiveAlert.RAIN) if not closed or observing or calibrating: try: async with asyncio.timeout(delay=30): - await self.overwatcher.notify( - "Unsafe conditions detected.", - 
level="warning", - ) + await ow.notify("Unsafe conditions detected.", level="warning") if observing and not cancelling: try: - await self.overwatcher.observer.stop_observing( + await ow.observer.stop_observing( immediate=True, reason="unsafe conditions", ) except Exception as err: - await self.overwatcher.notify( + await ow.notify( f"Error stopping observing: {decap(err)}", level="error", ) - await self.overwatcher.notify( + await ow.notify( "I will close the dome anyway.", level="warning", ) if calibrating: - await self.overwatcher.calibrations.cancel() + await ow.calibrations.cancel() except asyncio.TimeoutError: - await self.overwatcher.notify( + await ow.notify( "Timed out while handling unsafe conditions.", level="error", ) except Exception as err: - await self.overwatcher.notify( + await ow.notify( f"Error handling unsafe conditions: {decap(err)}", level="error", ) finally: if not closed: - await self.overwatcher.notify("Closing the dome.") - await self.overwatcher.dome.shutdown(retry=True, park=True) + await ow.notify("Closing the dome due to unsafe conditions.") + await ow.dome.shutdown(retry=True, park=True) # If we have to close because of unsafe conditions, we don't want - # to reopen too soon. We lock the dome for 30 minutes. - self.overwatcher.alerts.locked_until = time() + 1800 + # to reopen too soon. We lock the dome for some time. + timeout = ow.config["overwatcher.lock_timeout_on_unsafe"] + ow.alerts.locked_until = time() + timeout + + await ow.notify( + f"The dome will be locked for {int(timeout)} seconds.", + level="warning", + ) - if self.overwatcher.state.enabled: + if ow.state.enabled: if is_raining: - await self.overwatcher.notify( + await ow.notify( "Disabling the Overwatcher due to rain. 
" "Manually re-enable it when safe.", level="warning", ) - self.overwatcher.state.enabled = False + ow.state.enabled = False async def handle_daytime(self): """Handles daytime.""" diff --git a/src/gort/tools.py b/src/gort/tools.py index 395fece..5502a9d 100644 --- a/src/gort/tools.py +++ b/src/gort/tools.py @@ -741,8 +741,8 @@ async def get_lvmapi_route(route: str, params: dict = {}, **kwargs): ) as client: response = await client.get(route, params=params) - if response.status_code != 200: - raise ValueError(f"Route {route} failed with error {response.status_code}.") + if (code := response.status_code) != 200: + raise ValueError(f"Route /{route} failed with error {code}.") return response.json() From 7d74b5d0cd3b9019c165f772837a42c57aae23bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20S=C3=A1nchez-Gallego?= Date: Mon, 30 Dec 2024 11:03:37 -0800 Subject: [PATCH 2/3] Use triggers for connectivity status --- src/gort/overwatcher/alerts.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/src/gort/overwatcher/alerts.py b/src/gort/overwatcher/alerts.py index 4c06191..b358952 100644 --- a/src/gort/overwatcher/alerts.py +++ b/src/gort/overwatcher/alerts.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING -from lvmopstools.retrier import Retrier +from lvmopstools.utils import Trigger from pydantic import BaseModel from gort.overwatcher.core import OverwatcherModule, OverwatcherModuleTask @@ -42,8 +42,14 @@ class AlertsSummary(BaseModel): o2_room_alerts: dict[str, bool] | None = None heater_alert: bool | None = None heater_camera_alerts: dict[str, bool] | None = None - lco_connectivity: bool | None = None - internet_connectivity: bool | None = None + + +class ConnectivityStatus: + """Status of the connectivity.""" + + def __init__(self): + self.lco = Trigger(n=3) + self.internet = Trigger(n=3) class ActiveAlert(enum.Flag): @@ -69,6 +75,8 @@ class AlertsMonitorTask(OverwatcherModuleTask["AlertsOverwatcher"]): 
keep_alive = True restart_on_error = True + INTERVAL: float = 20 + async def task(self): """Updates the alerts data.""" @@ -94,12 +102,12 @@ async def task(self): level="critical", ) - await asyncio.sleep(15) + await asyncio.sleep(self.INTERVAL) async def update_alerts(self): """Processes the weather update and determines whether it is safe to observe.""" - data = await self.module.get_alerts_summary() + data = await self.module.update_status() if data is None: raise ValueError("No alerts data available.") @@ -121,6 +129,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.state: AlertsSummary | None = None + self.connectivity = ConnectivityStatus() self.last_updated: float = 0.0 self.unavailable: bool = False @@ -169,12 +178,12 @@ def is_safe(self) -> tuple[bool, ActiveAlert]: active_alerts |= ActiveAlert.WIND is_safe = False - if not self.state.internet_connectivity: + if self.connectivity.internet.is_set(): self.log.warning("Internet connectivity lost.") active_alerts |= ActiveAlert.DISCONNECTED is_safe = False - if not self.state.lco_connectivity: + if self.connectivity.lco.is_set(): self.log.warning("Internal LCO connectivity lost.") active_alerts |= ActiveAlert.DISCONNECTED is_safe = False @@ -204,15 +213,25 @@ def is_safe(self) -> tuple[bool, ActiveAlert]: return is_safe, active_alerts - @Retrier(max_attempts=3, delay=5) - async def get_alerts_summary(self) -> AlertsSummary: + async def update_status(self) -> AlertsSummary: """Returns the alerts report.""" alerts_data = await get_lvmapi_route("/alerts/summary") summary = AlertsSummary(**alerts_data) + # For connectivity we want to avoid one single failure to trigger an alert + # which closes the dome. The connectivity status is a set of triggers that + # need several settings to be activated.
connectivity_data = await get_lvmapi_route("/alerts/connectivity") - summary.lco_connectivity = connectivity_data["lco_connectivity"] - summary.internet_connectivity = connectivity_data["internet"] + + if not connectivity_data["lco"]: + self.connectivity.lco.set() + else: + self.connectivity.lco.reset() + + if not connectivity_data["internet"]: + self.connectivity.internet.set() + else: + self.connectivity.internet.reset() return summary From 181968775b2ae7bf5cdf1a0dabba096314b0dee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20S=C3=A1nchez-Gallego?= Date: Mon, 30 Dec 2024 11:19:34 -0800 Subject: [PATCH 3/3] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fcaf73..f3cb3c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### ✨ Improved * [#44](https://vscode.dev/github/sdss/lvmgort/pull/44) RORR RID-019: disables the Overwatcher if rain is detected and requires a human to re-enable it when conditions are safe. +* [#46](https://github.com/sdss/lvmgort/pull/46) RORR RID-017: treat lost connectivity to the internet or LCO as an unsafe condition and close. * Create the night log during the pre-observing task. ### 🏷️ Changed