-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add custom app exception handling & auto CUDA OOM exception handling (#274)
* feat: add app exception handling * feat: add exception handlers * feat: add tests * refactor: use the old style * feat: check for cuda message too * fix: remove app start * refactor: use app client * fix: app client startup * fix: remove limit on log test
- Loading branch information
1 parent
5d6799d
commit c010931
Showing
6 changed files
with
173 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from __future__ import annotations | ||
|
||
from ._base import FalServerlessException # noqa: F401 | ||
from ._base import AppException, FalServerlessException, FieldException # noqa: F401 | ||
from ._cuda import CUDAOutOfMemoryException # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,50 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import Sequence | ||
|
||
|
||
class FalServerlessException(Exception):
    """Common base class for errors raised by fal Serverless flows and APIs.

    It adds no behaviour of its own; it exists so that callers can catch
    every fal Serverless error with a single ``except`` clause.
    """
|
||
|
||
@dataclass
class AppException(FalServerlessException):
    """Base exception class for application-specific errors.

    Attributes:
        message: A descriptive message explaining the error.
        status_code: The HTTP status code associated with the error.
    """

    message: str
    status_code: int

    def __str__(self) -> str:
        """Return the human-readable error message."""
        # The dataclass-generated ``__init__`` never calls
        # ``Exception.__init__``, so the inherited ``__str__`` would render the
        # raw positional-argument tuple (or an empty string when the exception
        # is built with keyword arguments) in logs and tracebacks.
        return str(self.message)
|
||
|
||
@dataclass
class FieldException(FalServerlessException):
    """Exception raised for errors related to specific fields.

    Attributes:
        field: The field that caused the error.
        message: A descriptive message explaining the error.
        status_code: The HTTP status code associated with the error.
            Defaults to 422.
        type: The type of error. Defaults to "value_error".
    """

    field: str
    message: str
    status_code: int = 422
    type: str = "value_error"

    def __str__(self) -> str:
        """Return the human-readable error message."""
        # The dataclass-generated ``__init__`` never calls
        # ``Exception.__init__``, so the inherited ``__str__`` would render the
        # raw positional-argument tuple (or an empty string when the exception
        # is built with keyword arguments) in logs and tracebacks.
        return str(self.message)

    def to_pydantic_format(self) -> Sequence[dict]:
        """Return the error as a one-element list of ``loc``/``msg``/``type``.

        The location is always reported as ``["body", <field>]``.
        """
        return [
            {
                "loc": ["body", self.field],
                "msg": self.message,
                "type": self.type,
            }
        ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
|
||
from ._base import AppException | ||
|
||
# PyTorch error message for out of memory | ||
_CUDA_OOM_MESSAGE = "CUDA error: out of memory" | ||
|
||
# Special status code for CUDA out of memory errors | ||
_CUDA_OOM_STATUS_CODE = 503 | ||
|
||
|
||
@dataclass
class CUDAOutOfMemoryException(AppException):
    """Application error signalling that a CUDA operation ran out of memory.

    Both fields have defaults, so the exception can be raised with no
    arguments.

    Attributes:
        message: Error text. Defaults to ``_CUDA_OOM_MESSAGE``.
        status_code: Status code for the error. Defaults to
            ``_CUDA_OOM_STATUS_CODE``.
    """

    message: str = _CUDA_OOM_MESSAGE
    status_code: int = _CUDA_OOM_STATUS_CODE
|
||
|
||
# based on https://github.com/Lightning-AI/pytorch-lightning/blob/37e04d075a5532c69b8ac7457795b4345cca30cc/src/lightning/pytorch/utilities/memory.py#L49 | ||
def _is_cuda_oom_exception(exception: BaseException) -> bool:
    """Return whether *exception* should be treated as a CUDA OOM failure.

    An exception qualifies when it matches either the CUDA out-of-memory
    check or the cuDNN ``CUDNN_STATUS_NOT_SUPPORTED`` check.
    """
    if _is_cuda_out_of_memory(exception):
        return True
    return _is_cudnn_snafu(exception)
|
||
|
||
# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py | ||
def _is_cuda_out_of_memory(exception: BaseException) -> bool: | ||
return ( | ||
isinstance(exception, RuntimeError) | ||
and len(exception.args) == 1 | ||
and "CUDA" in exception.args[0] | ||
and "out of memory" in exception.args[0] | ||
) | ||
|
||
|
||
# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py | ||
def _is_cudnn_snafu(exception: BaseException) -> bool: | ||
# For/because of https://github.com/pytorch/pytorch/issues/4107 | ||
return ( | ||
isinstance(exception, RuntimeError) | ||
and len(exception.args) == 1 | ||
and "cuDNN error: CUDNN_STATUS_NOT_SUPPORTED." in exception.args[0] | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters