Reduce isvc fail timeout, fix TCP_PROTOCOLS constant, add Error to exception class names (#95)

* Create size-labeler.yml * Delete .github/workflows/size-labeler.yml * add fail fast * add fail fast * fix const * fix arg name
opendatahub-io · Jan 16, 2025 · 793b4db · 793b4db
1 parent 29e7d23
commit 793b4db
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 22 deletions.
diff --git a/tests/model_registry/utils.py b/tests/model_registry/utils.py
@@ -4,7 +4,7 @@
 from ocp_resources.model_registry import ModelRegistry
 from kubernetes.dynamic.exceptions import ResourceNotFoundError
 
-from utilities.exceptions import ProtocolNotSupported, TooManyServices
+from utilities.exceptions import ProtocolNotSupportedError, TooManyServicesError
 from utilities.constants import Protocols, HTTPRequest
 from tests.model_registry.constants import ModelRegistryEndpoints
 
@@ -35,15 +35,15 @@ def get_mr_service_by_label(client: DynamicClient, ns: Namespace, mr_instance: M
     ]:
         if len(svc) == 1:
             return svc[0]
-        raise TooManyServices(svc)
+        raise TooManyServicesError(svc)
     raise ResourceNotFoundError(f"{mr_instance.name} has no Service")
 
 
 def get_endpoint_from_mr_service(client: DynamicClient, svc: Service, protocol: str) -> str:
+    """Return the endpoint address stored in a service annotation for ``protocol``.
+
+    The value is read from the annotation named
+    ``f"{ADDRESS_ANNOTATION_PREFIX}{protocol}"`` on ``svc``.
+
+    Args:
+        client: Not used by this function's body.
+        svc: Service whose annotations are read.
+        protocol: ``Protocols.REST`` or ``Protocols.GRPC``.
+
+    Raises:
+        ProtocolNotSupportedError: If ``protocol`` is neither REST nor GRPC.
+    """
     if protocol in (Protocols.REST, Protocols.GRPC):
         return svc.instance.metadata.annotations[f"{ADDRESS_ANNOTATION_PREFIX}{protocol}"]
     else:
-        raise ProtocolNotSupported(protocol)
+        raise ProtocolNotSupportedError(protocol)
 
 
 def generate_register_model_command(endpoint: str, token: str) -> str:

diff --git a/tests/model_serving/model_server/private_endpoint/utils.py b/tests/model_serving/model_server/private_endpoint/utils.py
@@ -9,7 +9,7 @@
 from simple_logger.logger import get_logger
 
 from utilities.constants import Protocols
-from utilities.exceptions import ProtocolNotSupported
+from utilities.exceptions import ProtocolNotSupportedError
 
 LOGGER = get_logger(name=__name__)
 
@@ -21,7 +21,7 @@ def curl_from_pod(
     protocol: str = Protocols.HTTP,
 ) -> str:
     if protocol not in (Protocols.HTTPS, Protocols.HTTP):
-        raise ProtocolNotSupported(protocol)
+        raise ProtocolNotSupportedError(protocol)
     host = isvc.instance.status.address.url
     if protocol == "http":
         parsed = urlparse(host)

diff --git a/tests/model_serving/model_server/utils.py b/tests/model_serving/model_server/utils.py
@@ -7,15 +7,52 @@
 from kubernetes.dynamic import DynamicClient
 from ocp_resources.inference_service import InferenceService
 from simple_logger.logger import get_logger
+from timeout_sampler import TimeoutSampler
 
 from utilities.constants import KServeDeploymentType
-from utilities.exceptions import InferenceResponseError, InvalidStorageArgument
+from utilities.exceptions import FailedPodsError, InferenceResponseError, InvalidStorageArgumentError
 from utilities.inference_utils import UserInference
-from utilities.infra import wait_for_inference_deployment_replicas
+from utilities.infra import (
+    get_pods_by_isvc_label,
+    wait_for_inference_deployment_replicas,
+)
 
 LOGGER = get_logger(name=__name__)
 
 
+def verify_no_failed_pods(client: DynamicClient, isvc: InferenceService) -> None:
+    """Fail fast when a pod of an inference service is in a failed state.
+
+    Polls ``get_pods_by_isvc_label`` every 10 seconds for up to 5 minutes.
+    Returns as soon as every sampled pod reports the ``RUNNING`` phase and
+    raises as soon as at least one sampled pod looks failed.
+
+    A pod is treated as failed when any of its init containers last
+    terminated with reason ``"Error"``, or when its phase is one of the
+    failure values checked below. Both checks run for every pod, so a pod
+    with healthy init containers is still caught by the phase check.
+
+    Args:
+        client: Cluster client passed through to ``get_pods_by_isvc_label``.
+        isvc: Inference service whose pods are inspected.
+
+    Raises:
+        FailedPodsError: If at least one failed pod is found; carries a
+            mapping of pod name to pod status.
+
+    NOTE(review): when the pods neither all reach ``RUNNING`` nor fail within
+    the window, ``TimeoutSampler`` is expected to raise its own timeout
+    error - confirm against the timeout_sampler package.
+    """
+    failed_pods: dict[str, Any] = {}
+
+    for pods in TimeoutSampler(
+        wait_timeout=5 * 60,
+        sleep=10,
+        func=get_pods_by_isvc_label,
+        client=client,
+        isvc=isvc,
+    ):
+        # Nothing scheduled yet: keep sampling.
+        if not pods:
+            continue
+
+        if all(pod.instance.status.phase == pod.Status.RUNNING for pod in pods):
+            return
+
+        for pod in pods:
+            pod_status = pod.instance.status
+
+            # Inspect every init container, not only the first one.
+            init_errored = any(
+                status.lastState.terminated and status.lastState.terminated.reason == "Error"
+                for status in pod_status.initContainerStatuses or []
+            )
+
+            # NOTE(review): CrashLoopBackOff / ImagePullBackOff / ErrImagePull are
+            # normally reported as container waiting reasons rather than as pod
+            # phase values - confirm these comparisons can actually match.
+            phase_failed = pod_status.phase in (
+                pod.Status.CRASH_LOOPBACK_OFF,
+                pod.Status.FAILED,
+                pod.Status.IMAGE_PULL_BACK_OFF,
+                pod.Status.ERR_IMAGE_PULL,
+            )
+
+            if init_errored or phase_failed:
+                failed_pods[pod.name] = pod_status
+
+        if failed_pods:
+            raise FailedPodsError(pods=failed_pods)
+
+
 @contextmanager
 def create_isvc(
     client: DynamicClient,
@@ -113,18 +150,19 @@ def create_isvc(
         predictor=predictor_dict,
         label=labels,
     ) as inference_service:
+        if wait_for_predictor_pods:
+            verify_no_failed_pods(client=client, isvc=inference_service)
+            wait_for_inference_deployment_replicas(
+                client=client, isvc=inference_service, deployment_mode=deployment_mode
+            )
+
         if wait:
             inference_service.wait_for_condition(
                 condition=inference_service.Condition.READY,
                 status=inference_service.Condition.Status.TRUE,
                 timeout=15 * 60,
             )
 
-        if wait_for_predictor_pods:
-            wait_for_inference_deployment_replicas(
-                client=client, isvc=inference_service, deployment_mode=deployment_mode
-            )
-
         yield inference_service
 
 
@@ -134,7 +172,7 @@ def _check_storage_arguments(
     storage_path: Optional[str],
 ) -> None:
     if (storage_uri and storage_path) or (not storage_uri and not storage_key) or (storage_key and not storage_path):
-        raise InvalidStorageArgument(storage_uri, storage_key, storage_path)
+        raise InvalidStorageArgumentError(storage_uri=storage_uri, storage_key=storage_key, storage_path=storage_path)
 
 
 def verify_inference_response(

diff --git a/utilities/constants.py b/utilities/constants.py
@@ -90,8 +90,8 @@ class Protocols:
     HTTPS: str = "https"
     GRPC: str = "grpc"
     REST: str = "rest"
-    TCP_PROTOCOLS: set[str] = {"HTTP", "HTTPS"}
-    ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({"GRPC"})
+    TCP_PROTOCOLS: set[str] = {HTTP, HTTPS}
+    ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({GRPC})
 
 
 class HTTPRequest:

diff --git a/utilities/exceptions.py b/utilities/exceptions.py
@@ -3,15 +3,15 @@
 from ocp_resources.service import Service
 
 
-class ProtocolNotSupported(Exception):
+class ProtocolNotSupportedError(Exception):
+    """Error that records a rejected protocol name and reports it in its message."""
+
     def __init__(self, protocol: str):
         self.protocol = protocol
 
     def __str__(self) -> str:
         return f"Protocol {self.protocol} is not supported"
 
 
-class TooManyServices(Exception):
+class TooManyServicesError(Exception):
     def __init__(self, services: list[Service]):
         self.services = services
 
@@ -23,28 +23,36 @@ class InferenceResponseError(Exception):
     pass
 
 
-class InvalidStorageArgument(Exception):
+class InvalidStorageArgumentError(Exception):
+    """Error describing an invalid combination of ISVC storage arguments.
+
+    The three values supplied by the caller are stored on the instance and
+    echoed back in the message, followed by the accepted combinations.
+    """
+
     def __init__(
         self,
-        storageUri: Optional[str],
+        storage_uri: Optional[str],
         storage_key: Optional[str],
         storage_path: Optional[str],
     ):
-        self.storageUri = storageUri
+        self.storage_uri = storage_uri
         self.storage_key = storage_key
         self.storage_path = storage_path
 
     def __str__(self) -> str:
         msg = f"""
             You've passed the following parameters:
-            "storageUri": {self.storageUri}
+            "storage_uri": {self.storage_uri}
             "storage_key": {self.storage_key}
-            "storage_path: {self.storage_path}
+            "storage_path": {self.storage_path}
-            In order to create a valid ISVC you need to specify either a storageUri value
+            In order to create a valid ISVC you need to specify either a storage_uri value
             or both a storage key and a storage path.
         """
         return msg
 
 
 class MetricValidationError(Exception):
+    # NOTE(review): no raise site is visible here; presumably signals a failed metric check - confirm.
     pass
+
+
+class FailedPodsError(Exception):
+    """Error listing pods that were found in a failed state.
+
+    Args:
+        pods: Mapping of pod name to the status recorded for that pod.
+    """
+
+    # NOTE(review): the visible caller stores pod status objects as values,
+    # not strings - consider widening the ``dict[str, str]`` annotation.
+    def __init__(self, pods: dict[str, str]):
+        # Forward to Exception so ``args`` is populated even when ``pods`` is
+        # passed by keyword.
+        super().__init__(pods)
+        self.pods = pods
+
+    def __str__(self) -> str:
+        return f"The following pods are not running: {self.pods}"