Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor internet and LCO connectivity #46

Merged
merged 3 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### ✨ Improved

* [#44](https://github.com/sdss/lvmgort/pull/44) RORR RID-019: disables the Overwatcher if rain is detected and requires a human to re-enable it when conditions are safe.
* [#46](https://github.com/sdss/lvmgort/pull/46) RORR RID-017: treat lost connectivity to the internet or LCO as an unsafe condition and close.
* Create the night log during the pre-observing task.

### 🏷️ Changed
Expand Down
2 changes: 2 additions & 0 deletions src/gort/etc/lvmgort.yml
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ services:
notifications: gortdb.notification

overwatcher:
lock_timeout_on_unsafe: 1800

scheduler:
open_dome_buffer: 300

Expand Down
54 changes: 46 additions & 8 deletions src/gort/overwatcher/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from typing import TYPE_CHECKING

from lvmopstools.retrier import Retrier
from lvmopstools.utils import Trigger
from pydantic import BaseModel

from gort.overwatcher.core import OverwatcherModule, OverwatcherModuleTask
Expand Down Expand Up @@ -44,6 +44,14 @@ class AlertsSummary(BaseModel):
heater_camera_alerts: dict[str, bool] | None = None


class ConnectivityStatus:
    """Connectivity state, held as one trigger per monitored link.

    Attributes
    ----------
    lco
        Trigger for the LCO link, created with ``n=3``.
    internet
        Trigger for the internet link, created with ``n=3``.

    """

    def __init__(self) -> None:
        # NOTE(review): ``n=3`` presumably means the trigger only reads as set
        # after three ``set()`` calls -- confirm against lvmopstools.
        self.lco: Trigger = Trigger(n=3)
        self.internet: Trigger = Trigger(n=3)


class ActiveAlert(enum.Flag):
"""Flags for active alerts."""

Expand All @@ -56,6 +64,7 @@ class ActiveAlert(enum.Flag):
O2 = enum.auto()
LOCKED = enum.auto()
UNAVAILABLE = enum.auto()
DISCONNECTED = enum.auto()
UNKNOWN = enum.auto()


Expand All @@ -66,6 +75,8 @@ class AlertsMonitorTask(OverwatcherModuleTask["AlertsOverwatcher"]):
keep_alive = True
restart_on_error = True

INTERVAL: float = 20

async def task(self):
"""Updates the alerts data."""

Expand All @@ -91,12 +102,12 @@ async def task(self):
level="critical",
)

await asyncio.sleep(15)
await asyncio.sleep(self.INTERVAL)

async def update_alerts(self):
"""Processes the weather update and determines whether it is safe to observe."""

data = await self.module.get_alerts_summary()
data = await self.module.update_status()

if data is None:
raise ValueError("No alerts data available.")
Expand All @@ -118,6 +129,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.state: AlertsSummary | None = None
self.connectivity = ConnectivityStatus()

self.last_updated: float = 0.0
self.unavailable: bool = False
Expand Down Expand Up @@ -166,6 +178,16 @@ def is_safe(self) -> tuple[bool, ActiveAlert]:
active_alerts |= ActiveAlert.WIND
is_safe = False

if not self.connectivity.internet.is_set():
self.log.warning("Internet connectivity lost.")
active_alerts |= ActiveAlert.DISCONNECTED
is_safe = False

if not self.connectivity.lco.is_set():
self.log.warning("Internal LCO connectivity lost.")
active_alerts |= ActiveAlert.DISCONNECTED
is_safe = False

# These alerts are not critical but we log them.
# TODO: maybe we do want to do something about these alerts.
if self.state.door_alert:
Expand All @@ -183,17 +205,33 @@ def is_safe(self) -> tuple[bool, ActiveAlert]:
# and put a lock for 30 minutes to prevent the dome from opening/closing too
# frequently if the weather is unstable.
if self.locked_until > 0 and time() < self.locked_until:
return False, active_alerts | ActiveAlert.LOCKED
is_safe = False
active_alerts |= ActiveAlert.LOCKED

if is_safe:
self.locked_until = 0

return is_safe, active_alerts

@Retrier(max_attempts=3, delay=5)
async def update_status(self) -> AlertsSummary:
    """Refreshes the connectivity triggers and returns the alerts summary.

    Queries the ``/alerts/summary`` and ``/alerts/connectivity`` LVM API
    routes. The summary response is returned as an :class:`AlertsSummary`;
    the connectivity response is only used to update the ``lco`` and
    ``internet`` triggers in ``self.connectivity``.

    Returns
    -------
    summary
        The model built from the ``/alerts/summary`` response.

    """

    alerts_data = await get_lvmapi_route("/alerts/summary")
    summary = AlertsSummary(**alerts_data)

    # For connectivity we want to avoid one single failure to trigger an alert
    # which closes the dome. The connectivity status is a set of triggers that
    # need several settings to be activated.
    connectivity_data = await get_lvmapi_route("/alerts/connectivity")

    # A falsy value for a link means it is down: that counts towards setting
    # its trigger. Any truthy value resets the trigger.
    if not connectivity_data["lco"]:
        self.connectivity.lco.set()
    else:
        self.connectivity.lco.reset()

    if not connectivity_data["internet"]:
        self.connectivity.internet.set()
    else:
        self.connectivity.internet.reset()

    return summary
49 changes: 27 additions & 22 deletions src/gort/overwatcher/overwatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,71 +129,76 @@ async def task(self):
async def handle_unsafe(self):
"""Closes the dome if the conditions are unsafe."""

# NOTE(review): this span is an unmarked diff hunk with its indentation stripped.
# Lines using ``self.overwatcher`` appear to be the removed side and lines using
# the ``ow`` alias the added side -- confirm against the repository before applying.
closed = await self.overwatcher.dome.is_closing()
ow = self.overwatcher

observing = self.overwatcher.observer.is_observing
cancelling = self.overwatcher.observer.is_cancelling
calibrating = self.overwatcher.state.calibrating
closed = await ow.dome.is_closing()

observing = ow.observer.is_observing
cancelling = ow.observer.is_cancelling
calibrating = ow.state.calibrating

_, alerts_status = self.overwatcher.alerts.is_safe()
_, alerts_status = ow.alerts.is_safe()
# Pull the RAIN flag out of the active alerts; it is what gates disabling the
# Overwatcher in the last statements of this method.
is_raining = bool(alerts_status & ActiveAlert.RAIN)

if not closed or observing or calibrating:
try:
# A 30 s timeout bounds the handling that follows -- presumably the notification,
# the observer stop and the calibration cancel; the exact nesting is not
# recoverable from this view, verify.
async with asyncio.timeout(delay=30):
await self.overwatcher.notify(
"Unsafe conditions detected.",
level="warning",
)
await ow.notify("Unsafe conditions detected.", level="warning")

if observing and not cancelling:
try:
await self.overwatcher.observer.stop_observing(
await ow.observer.stop_observing(
immediate=True,
reason="unsafe conditions",
)
except Exception as err:
await self.overwatcher.notify(
await ow.notify(
f"Error stopping observing: {decap(err)}",
level="error",
)
await self.overwatcher.notify(
await ow.notify(
"I will close the dome anyway.",
level="warning",
)

if calibrating:
await self.overwatcher.calibrations.cancel()
await ow.calibrations.cancel()

except asyncio.TimeoutError:
await self.overwatcher.notify(
await ow.notify(
"Timed out while handling unsafe conditions.",
level="error",
)

except Exception as err:
await self.overwatcher.notify(
await ow.notify(
f"Error handling unsafe conditions: {decap(err)}",
level="error",
)

finally:
if not closed:
await self.overwatcher.notify("Closing the dome.")
await self.overwatcher.dome.shutdown(retry=True, park=True)
await ow.notify("Closing the dome due to unsafe conditions.")
await ow.dome.shutdown(retry=True, park=True)

# If we have to close because of unsafe conditions, we don't want
# to reopen too soon. We lock the dome for 30 minutes.
self.overwatcher.alerts.locked_until = time() + 1800
# to reopen too soon. We lock the dome for some time.
# The lock duration is read from the configuration key
# "overwatcher.lock_timeout_on_unsafe" rather than the literal 1800 used above.
timeout = ow.config["overwatcher.lock_timeout_on_unsafe"]
ow.alerts.locked_until = time() + timeout

await ow.notify(
f"The dome will be locked for {int(timeout)} seconds.",
level="warning",
)

if self.overwatcher.state.enabled:
if ow.state.enabled:
if is_raining:
await self.overwatcher.notify(
await ow.notify(
"Disabling the Overwatcher due to rain. "
"Manually re-enable it when safe.",
level="warning",
)
self.overwatcher.state.enabled = False
ow.state.enabled = False

async def handle_daytime(self):
"""Handles daytime."""
Expand Down
4 changes: 2 additions & 2 deletions src/gort/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,8 +741,8 @@ async def get_lvmapi_route(route: str, params: dict = {}, **kwargs):
) as client:
response = await client.get(route, params=params)

if response.status_code != 200:
raise ValueError(f"Route {route} failed with error {response.status_code}.")
if (code := response.status_code) != 200:
raise ValueError(f"Route /{route} failed with error {code}.")

return response.json()

Expand Down