Skip to content

Commit

Permalink
Reduce isvc fail timeout, fix TCP_PROTOCOLS constant, add Error r…
Browse files Browse the repository at this point in the history
…o exception class names (#95)

* Create size-labeler.yml

* Delete .github/workflows/size-labeler.yml

* add fail fast

* add fail fast

* fix const

* fix arg name
  • Loading branch information
rnetser authored Jan 16, 2025
1 parent 29e7d23 commit 793b4db
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 22 deletions.
6 changes: 3 additions & 3 deletions tests/model_registry/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ocp_resources.model_registry import ModelRegistry
from kubernetes.dynamic.exceptions import ResourceNotFoundError

from utilities.exceptions import ProtocolNotSupported, TooManyServices
from utilities.exceptions import ProtocolNotSupportedError, TooManyServicesError
from utilities.constants import Protocols, HTTPRequest
from tests.model_registry.constants import ModelRegistryEndpoints

Expand Down Expand Up @@ -35,15 +35,15 @@ def get_mr_service_by_label(client: DynamicClient, ns: Namespace, mr_instance: M
]:
if len(svc) == 1:
return svc[0]
raise TooManyServices(svc)
raise TooManyServicesError(svc)
raise ResourceNotFoundError(f"{mr_instance.name} has no Service")


def get_endpoint_from_mr_service(client: DynamicClient, svc: Service, protocol: str) -> str:
    """Return the endpoint address stored on a Model Registry service annotation.

    Args:
        client: Not used by this function; kept as part of the signature.
        svc: Service whose ``metadata.annotations`` hold the address.
        protocol: Protocol to look up; must be ``Protocols.REST`` or ``Protocols.GRPC``.

    Returns:
        The value of the annotation named ``ADDRESS_ANNOTATION_PREFIX`` + ``protocol``.

    Raises:
        ProtocolNotSupportedError: If ``protocol`` is neither REST nor GRPC.
    """
    supported_protocols = (Protocols.REST, Protocols.GRPC)
    if protocol not in supported_protocols:
        raise ProtocolNotSupportedError(protocol)

    annotation_key = f"{ADDRESS_ANNOTATION_PREFIX}{protocol}"
    return svc.instance.metadata.annotations[annotation_key]


def generate_register_model_command(endpoint: str, token: str) -> str:
Expand Down
4 changes: 2 additions & 2 deletions tests/model_serving/model_server/private_endpoint/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from simple_logger.logger import get_logger

from utilities.constants import Protocols
from utilities.exceptions import ProtocolNotSupported
from utilities.exceptions import ProtocolNotSupportedError

LOGGER = get_logger(name=__name__)

Expand All @@ -21,7 +21,7 @@ def curl_from_pod(
protocol: str = Protocols.HTTP,
) -> str:
if protocol not in (Protocols.HTTPS, Protocols.HTTP):
raise ProtocolNotSupported(protocol)
raise ProtocolNotSupportedError(protocol)
host = isvc.instance.status.address.url
if protocol == "http":
parsed = urlparse(host)
Expand Down
54 changes: 46 additions & 8 deletions tests/model_serving/model_server/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,52 @@
from kubernetes.dynamic import DynamicClient
from ocp_resources.inference_service import InferenceService
from simple_logger.logger import get_logger
from timeout_sampler import TimeoutSampler

from utilities.constants import KServeDeploymentType
from utilities.exceptions import InferenceResponseError, InvalidStorageArgument
from utilities.exceptions import FailedPodsError, InferenceResponseError, InvalidStorageArgumentError
from utilities.inference_utils import UserInference
from utilities.infra import wait_for_inference_deployment_replicas
from utilities.infra import (
get_pods_by_isvc_label,
wait_for_inference_deployment_replicas,
)

LOGGER = get_logger(name=__name__)


def verify_no_failed_pods(client: DynamicClient, isvc: InferenceService) -> None:
    """Poll the pods of an InferenceService and fail fast when one of them has failed.

    ``get_pods_by_isvc_label`` is sampled every 10 seconds for up to 5 minutes.
    The function returns as soon as every sampled pod is in the ``RUNNING`` phase.

    Args:
        client: Client forwarded to ``get_pods_by_isvc_label``.
        isvc: InferenceService whose pods are inspected.

    Raises:
        FailedPodsError: If a pod's first init container last terminated with
            reason ``"Error"``, or a pod's phase is one of the failure statuses
            listed below. The error carries a ``{pod name: pod status}`` mapping.
    """
    # NOTE(review): what happens when the sampler runs out of time is decided by
    # TimeoutSampler itself; nothing here handles that case.
    failed_pods: dict[str, Any] = {}

    for pods in TimeoutSampler(
        wait_timeout=5 * 60,
        sleep=10,
        func=get_pods_by_isvc_label,
        client=client,
        isvc=isvc,
    ):
        if not pods:
            # Nothing scheduled yet; keep sampling.
            continue

        if all(pod.instance.status.phase == pod.Status.RUNNING for pod in pods):
            return

        for pod in pods:
            pod_status = pod.instance.status

            # Only the first init container is inspected.
            if init_container_status := pod_status.initContainerStatuses:
                if container_terminated := init_container_status[0].lastState.terminated:
                    if container_terminated.reason == "Error":
                        failed_pods[pod.name] = pod_status

            # Checked independently of the init-container branch: previously this
            # was an ``elif``, so any pod reporting init container statuses never
            # had its phase examined and a failed pod was only noticed on timeout.
            if pod_status.phase in (
                pod.Status.CRASH_LOOPBACK_OFF,
                pod.Status.FAILED,
                pod.Status.IMAGE_PULL_BACK_OFF,
                pod.Status.ERR_IMAGE_PULL,
            ):
                failed_pods[pod.name] = pod_status

        if failed_pods:
            raise FailedPodsError(pods=failed_pods)


@contextmanager
def create_isvc(
client: DynamicClient,
Expand Down Expand Up @@ -113,18 +150,19 @@ def create_isvc(
predictor=predictor_dict,
label=labels,
) as inference_service:
if wait_for_predictor_pods:
verify_no_failed_pods(client=client, isvc=inference_service)
wait_for_inference_deployment_replicas(
client=client, isvc=inference_service, deployment_mode=deployment_mode
)

if wait:
inference_service.wait_for_condition(
condition=inference_service.Condition.READY,
status=inference_service.Condition.Status.TRUE,
timeout=15 * 60,
)

if wait_for_predictor_pods:
wait_for_inference_deployment_replicas(
client=client, isvc=inference_service, deployment_mode=deployment_mode
)

yield inference_service


Expand All @@ -134,7 +172,7 @@ def _check_storage_arguments(
storage_path: Optional[str],
) -> None:
if (storage_uri and storage_path) or (not storage_uri and not storage_key) or (storage_key and not storage_path):
raise InvalidStorageArgument(storage_uri, storage_key, storage_path)
raise InvalidStorageArgumentError(storage_uri=storage_uri, storage_key=storage_key, storage_path=storage_path)


def verify_inference_response(
Expand Down
4 changes: 2 additions & 2 deletions utilities/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ class Protocols:
HTTPS: str = "https"
GRPC: str = "grpc"
REST: str = "rest"
TCP_PROTOCOLS: set[str] = {"HTTP", "HTTPS"}
ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({"GRPC"})
TCP_PROTOCOLS: set[str] = {HTTP, HTTPS}
ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({GRPC})


class HTTPRequest:
Expand Down
22 changes: 15 additions & 7 deletions utilities/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from ocp_resources.service import Service


class ProtocolNotSupportedError(Exception):
    """Raised when a helper is asked to use a protocol it does not handle.

    Args:
        protocol: The rejected protocol name; stored on ``self.protocol`` and
            interpolated into the error message.
    """

    def __init__(self, protocol: str):
        # Forward the value to Exception so ``args`` is populated even when the
        # caller constructs the error with ``protocol=...``.
        super().__init__(protocol)
        self.protocol = protocol

    def __str__(self) -> str:
        return f"Protocol {self.protocol} is not supported"


class TooManyServices(Exception):
class TooManyServicesError(Exception):
def __init__(self, services: list[Service]):
self.services = services

Expand All @@ -23,28 +23,36 @@ class InferenceResponseError(Exception):
pass


class InvalidStorageArgumentError(Exception):
    """Raised when the storage arguments given for an ISVC are not a valid combination.

    Args:
        storage_uri: The storage URI that was passed, if any.
        storage_key: The storage key that was passed, if any.
        storage_path: The storage path that was passed, if any.
    """

    def __init__(
        self,
        storage_uri: Optional[str],
        storage_key: Optional[str],
        storage_path: Optional[str],
    ):
        # Forward the values to Exception so ``args`` is populated even when the
        # error is constructed with keywords; without this the instance cannot
        # be copied or pickled.
        super().__init__(storage_uri, storage_key, storage_path)
        self.storage_uri = storage_uri
        self.storage_key = storage_key
        self.storage_path = storage_path

    def __str__(self) -> str:
        # Built line by line so the message does not depend on source indentation.
        # The "storage_path" label previously lacked its closing quote.
        return (
            "\n"
            "You've passed the following parameters:\n"
            f'"storage_uri": {self.storage_uri}\n'
            f'"storage_key": {self.storage_key}\n'
            f'"storage_path": {self.storage_path}\n'
            "In order to create a valid ISVC you need to specify either a storage_uri value\n"
            "or both a storage key and a storage path.\n"
        )


class MetricValidationError(Exception):
    """Plain Exception subclass with no extra state; by its name, signals a failed metric validation."""


class FailedPodsError(Exception):
    """Raised when one or more pods are found in a failed state.

    Args:
        pods: Mapping keyed by pod name; stored on ``self.pods`` and rendered
            as-is in the error message.
    """

    def __init__(self, pods: dict[str, str]):
        # Forward the mapping to Exception so ``args`` is populated even when
        # the error is raised as ``FailedPodsError(pods=...)``; without this
        # the instance cannot be copied or pickled.
        super().__init__(pods)
        self.pods = pods

    def __str__(self) -> str:
        return f"The following pods are not running: {self.pods}"

0 comments on commit 793b4db

Please sign in to comment.