From 5617b012484846802e51984bc7dce8c1da04fda3 Mon Sep 17 00:00:00 2001
From: ioangatop
Date: Mon, 14 Oct 2024 13:05:15 +0200
Subject: [PATCH 1/2] updates

---
 .github/workflows/cd.yaml                      |  18 +-
 .github/workflows/release.yaml                 |   7 -
 main.py                                        |  14 ++
 pyproject.toml                                 |   2 +-
 .../vision/data/datasets/segmentation/kits.py  | 196 ++++++++++++++++++
 5 files changed, 212 insertions(+), 25 deletions(-)
 create mode 100644 main.py
 create mode 100644 src/eva/vision/data/datasets/segmentation/kits.py

diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
index 990da5b7..9d9c0be7 100644
--- a/.github/workflows/cd.yaml
+++ b/.github/workflows/cd.yaml
@@ -11,7 +11,7 @@ permissions:
   contents: write
 
 jobs:
-  release-pypi:
+  deploy-docs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4
@@ -28,24 +28,8 @@ jobs:
         run: |
           git config user.email "action@github.com"
           git config user.name "GitHub Action"
-      - name: Bumping version
-        run: |
-          nox -s bump -- micro
-          git push origin main
-      - name: Build artifacts
-        run: |
-          nox -s build
-      - name: Test Build
-        run: |
-          python -m pip install dist/*.whl
-          eva --version
       - name: Deploy Documentation
         run: |
           git fetch origin gh-pages:gh-pages
           nox -s docs -- deploy --update-aliases main
           git push origin gh-pages
-      - name: Publish package distributions to PyPI
-        run: nox -s publish -- --no-build
-        env:
-          PDM_PUBLISH_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          PDM_PUBLISH_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 35335747..5d71d8af 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -13,9 +13,6 @@ permissions:
 jobs:
   release-pypi:
     runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: write
     steps:
       - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4
       - name: Setting up PDM
@@ -31,10 +28,6 @@ jobs:
         run: |
           git config user.email "action@github.com"
           git config user.name "GitHub Action"
-      - name: Bumping version
-        run: |
-          nox -s bump -- to "${{ github.ref_name }}"
-          git push origin main
       - name: Build artifacts
         run: |
           nox -s build
diff --git a/main.py b/main.py
new file mode 100644
index 00000000..fdbce0ab
--- /dev/null
+++ b/main.py
@@ -0,0 +1,14 @@
+from eva.vision.data.datasets import KiTS23
+
+
+dataset = KiTS23(root="data/kits23", split="train", download=True)
+dataset.prepare_data()
+dataset.setup()
+# dataset._download()
+
+index = 300
+image = dataset.load_image(index)
+mask = dataset.load_mask(index)
+
+print(image)
+print(mask.unique())
diff --git a/pyproject.toml b/pyproject.toml
index b1fc27a1..a36af45a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "kaiko-eva"
-version = "0.1.0"
+version = "0.1.2"
 description = "Evaluation Framework for oncology foundation models."
 keywords = [
   "machine-learning",
diff --git a/src/eva/vision/data/datasets/segmentation/kits.py b/src/eva/vision/data/datasets/segmentation/kits.py
new file mode 100644
index 00000000..e462a2e7
--- /dev/null
+++ b/src/eva/vision/data/datasets/segmentation/kits.py
@@ -0,0 +1,196 @@
+"""KiTS23 dataset."""
+
+import functools
+import glob
+import os
+from typing import Any, Callable, Dict, List, Literal, Tuple
+from urllib import request
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from torchvision import tv_tensors
+from typing_extensions import override
+
+from eva.core import utils
+from eva.core.data import splitting
+from eva.core.utils.progress_bar import tqdm
+from eva.vision.data.datasets import _utils, _validators, structs
+from eva.vision.data.datasets.segmentation import base
+from eva.vision.utils import io
+
+
+class KiTS23(base.ImageSegmentation):
+    """KiTS23 - The 2023 Kidney and Kidney Tumor Segmentation challenge.
+
+    Webpage: https://kits-challenge.org/kits23/
+    """
+
+    _train_index_ranges: List[Tuple[int, int]] = [(0, 300), (400, 589)]
+    """Train range indices."""
+
+    _expected_dataset_lengths: Dict[str | None, int] = {
+        "train": 38686,
+        "test": 8760,
+    }
+    """Expected dataset length per split."""
+
+    _sample_every_n_slices: int | None = None
+    """Keep every n-th slice of each 3D CT scan (`None` keeps all slices)."""
+
+    _license: str = "CC BY-NC-SA 4.0"
+    """Dataset license."""
+
+    def __init__(
+        self,
+        root: str,
+        split: Literal["train"],
+        download: bool = False,
+        transforms: Callable | None = None,
+    ) -> None:
+        """Initialize dataset.
+
+        Args:
+            root: Path to the root directory of the dataset. The dataset will
+                be downloaded and extracted here, if it does not already exist.
+            split: Dataset split to use.
+            download: Whether to download the data for the specified split.
+                Note that the download will be executed only by additionally
+                calling the :meth:`prepare_data` method and if the data does
+                not yet exist on disk.
+            transforms: A function/transforms that takes in an image and a target
+                mask and returns the transformed versions of both.
+        """
+        super().__init__(transforms=transforms)
+
+        self._root = root
+        self._split = split
+        self._download = download
+
+        self._indices: List[Tuple[int, int]] = []
+
+    @property
+    @override
+    def classes(self) -> List[str]:
+        return ["kidney", "tumor", "cyst"]
+
+    @functools.cached_property
+    @override
+    def class_to_idx(self) -> Dict[str, int]:
+        return {label: index for index, label in enumerate(self.classes)}
+
+    @override
+    def filename(self, index: int) -> str:
+        sample_index, _ = self._indices[index]
+        return self._volume_filename(sample_index)
+
+    @override
+    def prepare_data(self) -> None:
+        if self._download:
+            self._download_dataset()
+
+    @override
+    def configure(self) -> None:
+        self._indices = self._create_indices()
+
+    @override
+    def validate(self) -> None:
+        _validators.check_dataset_integrity(
+            self,
+            length=self._expected_dataset_lengths.get(self._split, 0),
+            n_classes=3,
+            first_and_last_labels=("kidney", "cyst"),
+        )
+
+    @override
+    def load_image(self, index: int) -> tv_tensors.Image:
+        sample_index, slice_index = self._indices[index]
+        volume_path = self._volume_path(sample_index)
+        image_array = io.read_nifti(volume_path, slice_index)
+        return tv_tensors.Image(image_array.transpose(2, 0, 1))
+
+    @override
+    def load_mask(self, index: int) -> tv_tensors.Mask:
+        sample_index, slice_index = self._indices[index]
+        segmentation_path = self._segmentation_path(sample_index)
+        semantic_labels = io.read_nifti(segmentation_path, slice_index)
+        return tv_tensors.Mask(semantic_labels.squeeze(), dtype=torch.int64)  # type: ignore[reportCallIssue]
+
+    @override
+    def load_metadata(self, index: int) -> Dict[str, Any]:
+        _, slice_index = self._indices[index]
+        return {"slice_index": slice_index}
+
+    @override
+    def __len__(self) -> int:
+        return len(self._indices)
+
+    def _create_indices(self) -> List[Tuple[int, int]]:
+        """Builds the dataset indices for the specified split.
+
+        Returns:
+            A list of tuples, where the first value indicates the
+            sample index and the second its corresponding slice
+            index.
+        """
+        indices = [
+            (sample_idx, slice_idx)
+            for sample_idx in self._get_split_indices()
+            for slice_idx in range(self._get_number_of_slices_per_volume(sample_idx))
+            if slice_idx % (self._sample_every_n_slices or 1) == 0
+        ]
+        return indices
+
+    def _get_split_indices(self) -> List[int]:
+        """Returns the sample indices for the specified split."""
+        split_index_ranges = {
+            "train": self._train_index_ranges,
+        }
+        index_ranges = split_index_ranges.get(self._split)
+        if index_ranges is None:
+            raise ValueError("Invalid data split. Use 'train' or 'test'.")
+
+        return _utils.ranges_to_indices(index_ranges)
+
+    def _get_number_of_slices_per_volume(self, sample_index: int) -> int:
+        """Returns the total number of slices of a volume."""
+        volume_shape = io.fetch_nifti_shape(self._volume_path(sample_index))
+        return volume_shape[-1]
+
+    def _volume_filename(self, sample_index: int) -> str:
+        return os.path.join(f"case_{sample_index:05d}", "imaging.nii.gz")
+
+    def _segmentation_filename(self, sample_index: int) -> str:
+        return os.path.join(f"case_{sample_index:05d}", "segmentation.nii.gz")
+
+    def _volume_path(self, sample_index: int) -> str:
+        return os.path.join(self._root, self._volume_filename(sample_index))
+
+    def _segmentation_path(self, sample_index: int) -> str:
+        return os.path.join(self._root, self._segmentation_filename(sample_index))
+
+    def _download_dataset(self) -> None:
+        """Downloads the dataset."""
+        self._print_license()
+        for case_id in tqdm(
+            self._get_split_indices(),
+            desc=">> Downloading dataset",
+            leave=False,
+        ):
+            image_path, segmentation_path = self._volume_path(case_id), self._segmentation_path(case_id)
+            if os.path.isfile(image_path) and os.path.isfile(segmentation_path):
+                continue
+
+            os.makedirs(os.path.dirname(image_path), exist_ok=True)
+            request.urlretrieve(
+                url=f"https://kits19.sfo2.digitaloceanspaces.com/master_{case_id:05d}.nii.gz",
+                filename=image_path,
+            )
+            request.urlretrieve(
+                url=f"https://github.com/neheller/kits23/raw/refs/heads/main/dataset/case_{case_id:05d}/segmentation.nii.gz",
+                filename=segmentation_path,
+            )
+
+    def _print_license(self) -> None:
+        """Prints the dataset license."""
+        print(f"Dataset license: {self._license}")

From a7d226ee85e9d36362b47ac3dd636ad197f9c291 Mon Sep 17 00:00:00 2001
From: ioangatop
Date: Mon, 14 Oct 2024 13:06:02 +0200
Subject: [PATCH 2/2] updates

---
 main.py                                        |  14 --
 .../vision/data/datasets/segmentation/kits.py  | 196 ------------------
 2 files changed, 210 deletions(-)
 delete mode 100644 main.py
 delete mode 100644 src/eva/vision/data/datasets/segmentation/kits.py

diff --git a/main.py b/main.py
deleted file mode 100644
index fdbce0ab..00000000
--- a/main.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from eva.vision.data.datasets import KiTS23
-
-
-dataset = KiTS23(root="data/kits23", split="train", download=True)
-dataset.prepare_data()
-dataset.setup()
-# dataset._download()
-
-index = 300
-image = dataset.load_image(index)
-mask = dataset.load_mask(index)
-
-print(image)
-print(mask.unique())
diff --git a/src/eva/vision/data/datasets/segmentation/kits.py b/src/eva/vision/data/datasets/segmentation/kits.py
deleted file mode 100644
index e462a2e7..00000000
--- a/src/eva/vision/data/datasets/segmentation/kits.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""KiTS23 dataset."""
-
-import functools
-import glob
-import os
-from typing import Any, Callable, Dict, List, Literal, Tuple
-from urllib import request
-
-import numpy as np
-import numpy.typing as npt
-import torch
-from torchvision import tv_tensors
-from typing_extensions import override
-
-from eva.core import utils
-from eva.core.data import splitting
-from eva.core.utils.progress_bar import tqdm
-from eva.vision.data.datasets import _utils, _validators, structs
-from eva.vision.data.datasets.segmentation import base
-from eva.vision.utils import io
-
-
-class KiTS23(base.ImageSegmentation):
-    """KiTS23 - The 2023 Kidney and Kidney Tumor Segmentation challenge.
-
-    Webpage: https://kits-challenge.org/kits23/
-    """
-
-    _train_index_ranges: List[Tuple[int, int]] = [(0, 300), (400, 589)]
-    """Train range indices."""
-
-    _expected_dataset_lengths: Dict[str | None, int] = {
-        "train": 38686,
-        "test": 8760,
-    }
-    """Expected dataset length per split."""
-
-    _sample_every_n_slices: int | None = None
-    """Keep every n-th slice of each 3D CT scan (`None` keeps all slices)."""
-
-    _license: str = "CC BY-NC-SA 4.0"
-    """Dataset license."""
-
-    def __init__(
-        self,
-        root: str,
-        split: Literal["train"],
-        download: bool = False,
-        transforms: Callable | None = None,
-    ) -> None:
-        """Initialize dataset.
-
-        Args:
-            root: Path to the root directory of the dataset. The dataset will
-                be downloaded and extracted here, if it does not already exist.
-            split: Dataset split to use.
-            download: Whether to download the data for the specified split.
-                Note that the download will be executed only by additionally
-                calling the :meth:`prepare_data` method and if the data does
-                not yet exist on disk.
-            transforms: A function/transforms that takes in an image and a target
-                mask and returns the transformed versions of both.
-        """
-        super().__init__(transforms=transforms)
-
-        self._root = root
-        self._split = split
-        self._download = download
-
-        self._indices: List[Tuple[int, int]] = []
-
-    @property
-    @override
-    def classes(self) -> List[str]:
-        return ["kidney", "tumor", "cyst"]
-
-    @functools.cached_property
-    @override
-    def class_to_idx(self) -> Dict[str, int]:
-        return {label: index for index, label in enumerate(self.classes)}
-
-    @override
-    def filename(self, index: int) -> str:
-        sample_index, _ = self._indices[index]
-        return self._volume_filename(sample_index)
-
-    @override
-    def prepare_data(self) -> None:
-        if self._download:
-            self._download_dataset()
-
-    @override
-    def configure(self) -> None:
-        self._indices = self._create_indices()
-
-    @override
-    def validate(self) -> None:
-        _validators.check_dataset_integrity(
-            self,
-            length=self._expected_dataset_lengths.get(self._split, 0),
-            n_classes=3,
-            first_and_last_labels=("kidney", "cyst"),
-        )
-
-    @override
-    def load_image(self, index: int) -> tv_tensors.Image:
-        sample_index, slice_index = self._indices[index]
-        volume_path = self._volume_path(sample_index)
-        image_array = io.read_nifti(volume_path, slice_index)
-        return tv_tensors.Image(image_array.transpose(2, 0, 1))
-
-    @override
-    def load_mask(self, index: int) -> tv_tensors.Mask:
-        sample_index, slice_index = self._indices[index]
-        segmentation_path = self._segmentation_path(sample_index)
-        semantic_labels = io.read_nifti(segmentation_path, slice_index)
-        return tv_tensors.Mask(semantic_labels.squeeze(), dtype=torch.int64)  # type: ignore[reportCallIssue]
-
-    @override
-    def load_metadata(self, index: int) -> Dict[str, Any]:
-        _, slice_index = self._indices[index]
-        return {"slice_index": slice_index}
-
-    @override
-    def __len__(self) -> int:
-        return len(self._indices)
-
-    def _create_indices(self) -> List[Tuple[int, int]]:
-        """Builds the dataset indices for the specified split.
-
-        Returns:
-            A list of tuples, where the first value indicates the
-            sample index and the second its corresponding slice
-            index.
-        """
-        indices = [
-            (sample_idx, slice_idx)
-            for sample_idx in self._get_split_indices()
-            for slice_idx in range(self._get_number_of_slices_per_volume(sample_idx))
-            if slice_idx % (self._sample_every_n_slices or 1) == 0
-        ]
-        return indices
-
-    def _get_split_indices(self) -> List[int]:
-        """Returns the sample indices for the specified split."""
-        split_index_ranges = {
-            "train": self._train_index_ranges,
-        }
-        index_ranges = split_index_ranges.get(self._split)
-        if index_ranges is None:
-            raise ValueError("Invalid data split. Use 'train' or 'test'.")
-
-        return _utils.ranges_to_indices(index_ranges)
-
-    def _get_number_of_slices_per_volume(self, sample_index: int) -> int:
-        """Returns the total number of slices of a volume."""
-        volume_shape = io.fetch_nifti_shape(self._volume_path(sample_index))
-        return volume_shape[-1]
-
-    def _volume_filename(self, sample_index: int) -> str:
-        return os.path.join(f"case_{sample_index:05d}", "imaging.nii.gz")
-
-    def _segmentation_filename(self, sample_index: int) -> str:
-        return os.path.join(f"case_{sample_index:05d}", "segmentation.nii.gz")
-
-    def _volume_path(self, sample_index: int) -> str:
-        return os.path.join(self._root, self._volume_filename(sample_index))
-
-    def _segmentation_path(self, sample_index: int) -> str:
-        return os.path.join(self._root, self._segmentation_filename(sample_index))
-
-    def _download_dataset(self) -> None:
-        """Downloads the dataset."""
-        self._print_license()
-        for case_id in tqdm(
-            self._get_split_indices(),
-            desc=">> Downloading dataset",
-            leave=False,
-        ):
-            image_path, segmentation_path = self._volume_path(case_id), self._segmentation_path(case_id)
-            if os.path.isfile(image_path) and os.path.isfile(segmentation_path):
-                continue
-
-            os.makedirs(os.path.dirname(image_path), exist_ok=True)
-            request.urlretrieve(
-                url=f"https://kits19.sfo2.digitaloceanspaces.com/master_{case_id:05d}.nii.gz",
-                filename=image_path,
-            )
-            request.urlretrieve(
-                url=f"https://github.com/neheller/kits23/raw/refs/heads/main/dataset/case_{case_id:05d}/segmentation.nii.gz",
-                filename=segmentation_path,
-            )
-
-    def _print_license(self) -> None:
-        """Prints the dataset license."""
-        print(f"Dataset license: {self._license}")
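
Note: _create_indices flattens each 3D CT volume into per-slice 2D samples,
keyed by (sample_index, slice_index) tuples. A minimal, self-contained sketch
of the same comprehension, using made-up slice counts in place of real KiTS23
volumes:

    from typing import Dict, List, Tuple

    # Hypothetical case_id -> number of axial slices per CT volume.
    volume_slices: Dict[int, int] = {0: 4, 1: 3}
    # Keep every 2nd slice; the patch defaults to None, which keeps all slices.
    sample_every_n_slices = 2

    indices: List[Tuple[int, int]] = [
        (sample_idx, slice_idx)
        for sample_idx in volume_slices
        for slice_idx in range(volume_slices[sample_idx])
        if slice_idx % (sample_every_n_slices or 1) == 0
    ]

    print(indices)  # [(0, 0), (0, 2), (1, 0), (1, 2)]

Slice-level samples let 2D segmentation models be evaluated directly on 3D CT
scans: __len__ counts slices rather than cases, which is why the expected
"train" length (38686) far exceeds the 489 training cases spanned by
_train_index_ranges (assuming half-open ranges).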