From e8d457f0e399f2927876782efb55d26a5f6b2a53 Mon Sep 17 00:00:00 2001
From: ioangatop
Date: Thu, 7 Mar 2024 16:12:47 +0100
Subject: [PATCH] remove crc nonorm dataset

---
 .../vision/dino_vit/offline/crc_nonorm.yaml   | 111 -------------
 .../vision/dino_vit/online/crc_nonorm.yaml    |  89 ----------
 src/eva/vision/data/datasets/__init__.py      |   2 -
 .../data/datasets/classification/__init__.py  |   2 -
 .../datasets/classification/crc_nonorm.py     | 156 ------------------
 .../classification/test_crc_nonorm.py         |  44 -----
 6 files changed, 404 deletions(-)
 delete mode 100644 configs/vision/dino_vit/offline/crc_nonorm.yaml
 delete mode 100644 configs/vision/dino_vit/online/crc_nonorm.yaml
 delete mode 100644 src/eva/vision/data/datasets/classification/crc_nonorm.py
 delete mode 100644 tests/eva/vision/data/datasets/classification/test_crc_nonorm.py

diff --git a/configs/vision/dino_vit/offline/crc_nonorm.yaml b/configs/vision/dino_vit/offline/crc_nonorm.yaml
deleted file mode 100644
index 5d9a41eb..00000000
--- a/configs/vision/dino_vit/offline/crc_nonorm.yaml
+++ /dev/null
@@ -1,111 +0,0 @@
----
-trainer:
-  class_path: eva.Trainer
-  init_args:
-    n_runs: 5
-    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:DINO_BACKBONE, dino_vits16}/offline/crc_nonorm}
-    max_steps: &MAX_STEPS 12500
-    callbacks:
-      - class_path: pytorch_lightning.callbacks.LearningRateMonitor
-        init_args:
-          logging_interval: epoch
-      - class_path: pytorch_lightning.callbacks.ModelCheckpoint
-        init_args:
-          filename: best
-          save_last: true
-          save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
-          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
-      - class_path: pytorch_lightning.callbacks.EarlyStopping
-        init_args:
-          min_delta: 0
-          patience: 48
-          monitor: *MONITOR_METRIC
-          mode: *MONITOR_METRIC_MODE
-      - class_path: eva.callbacks.EmbeddingsWriter
-        init_args:
-          output_dir: &EMBEDDINGS_DIR ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/${oc.env:DINO_BACKBONE, dino_vits16}/crc_nonorm
-          dataloader_idx_map:
-            0: train
-            1: val
-          backbone:
-            class_path: eva.models.ModelFromFunction
-            init_args:
-              path: torch.hub.load
-              arguments:
-                repo_or_dir: facebookresearch/dino:main
-                model: ${oc.env:DINO_BACKBONE, dino_vits16}
-                pretrained: ${oc.env:PRETRAINED, true}
-              checkpoint_path: ${oc.env:CHECKPOINT_PATH, null}
-    logger:
-      - class_path: pytorch_lightning.loggers.TensorBoardLogger
-        init_args:
-          save_dir: *OUTPUT_ROOT
-          name: ""
-model:
-  class_path: eva.HeadModule
-  init_args:
-    head:
-      class_path: torch.nn.Linear
-      init_args:
-        in_features: ${oc.env:IN_FEATURES, 384}
-        out_features: &NUM_CLASSES 9
-    criterion: torch.nn.CrossEntropyLoss
-    optimizer:
-      class_path: torch.optim.SGD
-      init_args:
-        lr: &LR_VALUE 0.01
-        momentum: 0.9
-        weight_decay: 0.0
-    lr_scheduler:
-      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
-      init_args:
-        T_max: *MAX_STEPS
-        eta_min: 0.0
-    metrics:
-      common:
-        - class_path: eva.metrics.AverageLoss
-        - class_path: eva.metrics.MulticlassClassificationMetrics
-          init_args:
-            num_classes: *NUM_CLASSES
-data:
-  class_path: eva.DataModule
-  init_args:
-    datasets:
-      train:
-        class_path: eva.vision.data.datasets.embeddings.PatchEmbeddingDataset
-        init_args: &DATASET_ARGS
-          root: *EMBEDDINGS_DIR
-          split: train
-          column_mapping:
-            path: embedding
-      val:
-        class_path: eva.vision.data.datasets.embeddings.PatchEmbeddingDataset
-        init_args:
-          <<: *DATASET_ARGS
-          split: val
-      predict:
-        - class_path: eva.vision.datasets.CRC_NONORM
-          init_args: &PREDICT_DATASET_ARGS
-            root: ${oc.env:DATA_ROOT, ./data}/crc_he
-            split: train
-            download: ${oc.env:DOWNLOAD_DATA, true}
-            image_transforms:
-              class_path: eva.vision.data.transforms.common.ResizeAndCrop
-              init_args:
-                mean: [0.485, 0.456, 0.406]
-                std: [0.229, 0.224, 0.225]
-        - class_path: eva.vision.datasets.CRC_NONORM
-          init_args:
-            <<: *PREDICT_DATASET_ARGS
-            split: val
-    dataloaders:
-      train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 4096}
-        shuffle: true
-      val:
-        batch_size: *BATCH_SIZE
-      test:
-        batch_size: *BATCH_SIZE
-      predict:
-        batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 128}
diff --git a/configs/vision/dino_vit/online/crc_nonorm.yaml b/configs/vision/dino_vit/online/crc_nonorm.yaml
deleted file mode 100644
index ed79e45e..00000000
--- a/configs/vision/dino_vit/online/crc_nonorm.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
----
-trainer:
-  class_path: eva.Trainer
-  init_args:
-    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:DINO_BACKBONE, dino_vits16}/online/crc_nonorm}
-    max_steps: &MAX_STEPS 12500
-    callbacks:
-      - class_path: pytorch_lightning.callbacks.LearningRateMonitor
-        init_args:
-          logging_interval: epoch
-      - class_path: pytorch_lightning.callbacks.ModelCheckpoint
-        init_args:
-          filename: best
-          save_last: true
-          save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
-          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
-      - class_path: pytorch_lightning.callbacks.EarlyStopping
-        init_args:
-          min_delta: 0
-          patience: 48
-          monitor: *MONITOR_METRIC
-          mode: *MONITOR_METRIC_MODE
-    logger:
-      - class_path: pytorch_lightning.loggers.TensorBoardLogger
-        init_args:
-          save_dir: *OUTPUT_ROOT
-          name: ""
-model:
-  class_path: eva.HeadModule
-  init_args:
-    backbone:
-      class_path: eva.models.ModelFromFunction
-      init_args:
-        path: torch.hub.load
-        arguments:
-          repo_or_dir: facebookresearch/dino:main
-          model: ${oc.env:DINO_BACKBONE, dino_vits16}
-          pretrained: ${oc.env:PRETRAINED, true}
-        checkpoint_path: &CHECKPOINT_PATH ${oc.env:CHECKPOINT_PATH, null}
-    head:
-      class_path: torch.nn.Linear
-      init_args:
-        in_features: ${oc.env:IN_FEATURES, 384}
-        out_features: &NUM_CLASSES 9
-    criterion: torch.nn.CrossEntropyLoss
-    optimizer:
-      class_path: torch.optim.SGD
-      init_args:
-        lr: &LR_VALUE 0.00064
-        momentum: 0.9
-        weight_decay: 0.0
-    lr_scheduler:
-      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
-      init_args:
-        T_max: *MAX_STEPS
-        eta_min: 0.0
-    metrics:
-      common:
-        - class_path: eva.metrics.AverageLoss
-        - class_path: eva.metrics.MulticlassClassificationMetrics
-          init_args:
-            num_classes: *NUM_CLASSES
-data:
-  class_path: eva.DataModule
-  init_args:
-    datasets:
-      train:
-        class_path: eva.vision.datasets.CRC_NONORM
-        init_args: &DATASET_ARGS
-          root: ${oc.env:DATA_ROOT, ./data}/crc
-          split: train
-          download: ${oc.env:DOWNLOAD_DATA, true}
-          image_transforms:
-            class_path: eva.vision.data.transforms.common.ResizeAndCrop
-            init_args:
-              mean: [0.485, 0.456, 0.406]
-              std: [0.229, 0.224, 0.225]
-      val:
-        class_path: eva.vision.datasets.CRC_NONORM
-        init_args:
-          <<: *DATASET_ARGS
-          split: val
-    dataloaders:
-      train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
-        shuffle: true
-      val:
-        batch_size: *BATCH_SIZE
diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py
index 86601366..02a40059 100644
--- a/src/eva/vision/data/datasets/__init__.py
+++ b/src/eva/vision/data/datasets/__init__.py
@@ -3,7 +3,6 @@
 from eva.vision.data.datasets.classification import (
     BACH,
     CRC,
-    CRC_NONORM,
     PatchCamelyon,
     TotalSegmentatorClassification,
 )
@@ -14,7 +13,6 @@
 __all__ = [
     "BACH",
     "CRC",
-    "CRC_NONORM",
     "PatchEmbeddingDataset",
     "ImageSegmentation",
     "SlideEmbeddingDataset",
diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py
index 127ddf51..557e15e0 100644
--- a/src/eva/vision/data/datasets/classification/__init__.py
+++ b/src/eva/vision/data/datasets/classification/__init__.py
@@ -2,14 +2,12 @@
 
 from eva.vision.data.datasets.classification.bach import BACH
 from eva.vision.data.datasets.classification.crc import CRC
-from eva.vision.data.datasets.classification.crc_nonorm import CRC_NONORM
 from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
 from eva.vision.data.datasets.classification.total_segmentator import TotalSegmentatorClassification
 
 __all__ = [
     "BACH",
     "CRC",
-    "CRC_NONORM",
     "PatchCamelyon",
     "TotalSegmentatorClassification",
 ]
diff --git a/src/eva/vision/data/datasets/classification/crc_nonorm.py b/src/eva/vision/data/datasets/classification/crc_nonorm.py
deleted file mode 100644
index 14345e5a..00000000
--- a/src/eva/vision/data/datasets/classification/crc_nonorm.py
+++ /dev/null
@@ -1,156 +0,0 @@
-"""CRC-NONORM dataset class."""
-
-import os
-from typing import Callable, Dict, List, Literal, Tuple
-
-import numpy as np
-from torchvision.datasets import folder, utils
-from typing_extensions import override
-
-from eva.vision.data.datasets import structs
-from eva.vision.data.datasets.classification import base
-from eva.vision.utils import io
-
-
-class CRC_NONORM(base.ImageClassification):
-    """Dataset class for CRC-NONORM images and corresponding targets."""
-
-    _train_resource: structs.DownloadResource = structs.DownloadResource(
-        filename="NCT-CRC-HE-100K-NONORM.zip",
-        url="https://zenodo.org/records/1214456/files/NCT-CRC-HE-100K-NONORM.zip?download=1",
-        md5="md5:035777cf327776a71a05c95da6d6325f",
-    )
-    """Train resource."""
-
-    _val_resource: structs.DownloadResource = structs.DownloadResource(
-        filename="CRC-VAL-HE-7K.zip",
-        url="https://zenodo.org/records/1214456/files/CRC-VAL-HE-7K.zip?download=1",
-        md5="md5:2fd1651b4f94ebd818ebf90ad2b6ce06",
-    )
-    """Validation resource."""
-
-    def __init__(
-        self,
-        root: str,
-        split: Literal["train", "val"],
-        download: bool = False,
-        image_transforms: Callable | None = None,
-        target_transforms: Callable | None = None,
-    ) -> None:
-        """Initializes the dataset.
-
-        The dataset is split into a train (train) and validation (val) set:
-          - train: This is a slightly different version of the "NCT-CRC-HE-100K" image set:
-            it contains 100,000 images in 9 tissue classes at 0.5 MPP and was created
-            from the same raw data as "NCT-CRC-HE-100K". However, no color normalization was
-            applied to these images. Consequently, staining intensity and color vary slightly
-            between the images.
-          - val: A set of 7180 image patches from N=50 patients with colorectal adenocarcinoma
-            (no overlap with patients in NCT-CRC-HE-100K).
-
-        Args:
-            root: Path to the root directory of the dataset.
-            split: Dataset split to use.
-            download: Whether to download the data for the specified split.
-                Note that the download will be executed only by additionally
-                calling the :meth:`prepare_data` method and if the data does
-                not yet exist on disk.
-            image_transforms: A function/transform that takes in an image
-                and returns a transformed version.
-            target_transforms: A function/transform that takes in the target
-                and transforms it.
- """ - super().__init__( - image_transforms=image_transforms, - target_transforms=target_transforms, - ) - - self._root = root - self._split = split - self._download = download - - self._samples: List[Tuple[str, int]] = [] - - @property - @override - def classes(self) -> List[str]: - return ["ADI", "BACK", "DEB", "LYM", "MUC", "MUS", "NORM", "STR", "TUM"] - - @property - @override - def class_to_idx(self) -> Dict[str, int]: - return { - "ADI": 0, - "BACK": 1, - "DEB": 2, - "LYM": 3, - "MUC": 4, - "MUS": 5, - "NORM": 6, - "STR": 7, - "TUM": 8, - } - - @override - def filename(self, index: int) -> str: - image_path, *_ = self._samples[index] - return os.path.relpath(image_path, self._dataset_dir) - - @override - def prepare_data(self) -> None: - if self._download: - self._download_dataset() - - @override - def setup(self) -> None: - self._samples = self._make_dataset() - - @override - def load_image(self, index: int) -> np.ndarray: - image_path, _ = self._samples[index] - return io.read_image(image_path) - - @override - def load_target(self, index: int) -> np.ndarray: - _, target = self._samples[index] - return np.asarray(target, dtype=np.int64) - - @override - def __len__(self) -> int: - return len(self._samples) - - @property - def _dataset_dir(self) -> str: - """Returns the full path of dataset directory.""" - dataset_dirs = { - "train": os.path.join(self._root, "NCT-CRC-HE-100K-NONORM"), - "val": os.path.join(self._root, "CRC-VAL-HE-7K"), - } - dataset_dir = dataset_dirs.get(self._split) - if dataset_dir is None: - raise ValueError("Invalid data split. Use 'train' or 'val'.") - - return dataset_dir - - def _make_dataset(self) -> List[Tuple[str, int]]: - """Builds the dataset for the specified split.""" - dataset = folder.make_dataset( - directory=self._dataset_dir, - class_to_idx=self.class_to_idx, - extensions=(".tif"), - ) - return dataset - - def _download_dataset(self) -> None: - """Downloads the dataset resources.""" - for resource in [self._train_resource, self._val_resource]: - resource_dir = resource.filename.rsplit(".", maxsplit=1)[0] - if os.path.isdir(os.path.join(self._root, resource_dir)): - continue - - utils.download_and_extract_archive( - resource.url, - download_root=self._root, - filename=resource.filename, - remove_finished=True, - ) diff --git a/tests/eva/vision/data/datasets/classification/test_crc_nonorm.py b/tests/eva/vision/data/datasets/classification/test_crc_nonorm.py deleted file mode 100644 index 9f585388..00000000 --- a/tests/eva/vision/data/datasets/classification/test_crc_nonorm.py +++ /dev/null @@ -1,44 +0,0 @@ -"""CRC_NONORM dataset tests.""" - -import os -from typing import Literal - -import numpy as np -import pytest - -from eva.vision.data import datasets - - -@pytest.mark.parametrize( - "split, index", - [ - ("train", 0), - ("train", 2), - ("val", 0), - ("val", 2), - ], -) -def test_sample(crc_nonorm_dataset: datasets.CRC_NONORM, index: int) -> None: - """Tests the format of a dataset sample.""" - # assert data sample is a tuple - sample = crc_nonorm_dataset[index] - assert isinstance(sample, tuple) - assert len(sample) == 2 - # assert the format of the `image` and `target` - image, target = sample - assert isinstance(image, np.ndarray) - assert image.shape == (16, 16, 3) - assert isinstance(target, np.ndarray) - assert target in [0, 1, 2, 3, 4, 5, 6, 7, 8] - - -@pytest.fixture(scope="function") -def crc_nonorm_dataset(split: Literal["train", "val"], assets_path: str) -> datasets.CRC_NONORM: - """CRC_HE_NONORM dataset fixture.""" - dataset = 
-        root=os.path.join(assets_path, "vision", "datasets", "crc"),
-        split=split,
-    )
-    dataset.prepare_data()
-    dataset.setup()
-    return dataset