From 6bcbb11b9cdae59516d1ccaae7ee9698b6c49717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kozlovsk=C3=BD?= Date: Tue, 8 Oct 2024 23:56:37 +0200 Subject: [PATCH] LuxonisParser - RoboFlow URL Support (#189) --- .github/workflows/ci.yaml | 3 + luxonis_ml/data/parsers/luxonis_parser.py | 68 +++++++++++++++++++++-- luxonis_ml/data/requirements.txt | 1 + luxonis_ml/utils/environ.py | 2 + luxonis_ml/utils/filesystem.py | 21 +++---- tests/test_data/test_parsers.py | 14 ++++- 6 files changed, 92 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 660b09bc..67aea301 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -41,6 +41,9 @@ jobs: with: ref: ${{ github.head_ref }} + - name: Install pre-commit + run: python3 -m pip install 'pre-commit<4.0.0' + - name: Run pre-commit uses: pre-commit/action@v3.0.1 diff --git a/luxonis_ml/data/parsers/luxonis_parser.py b/luxonis_ml/data/parsers/luxonis_parser.py index 1a08e0fc..fa8abb26 100644 --- a/luxonis_ml/data/parsers/luxonis_parser.py +++ b/luxonis_ml/data/parsers/luxonis_parser.py @@ -1,6 +1,7 @@ import logging import zipfile from enum import Enum +from importlib.util import find_spec from pathlib import Path from typing import ( Dict, @@ -16,7 +17,8 @@ from luxonis_ml.data import DATASETS_REGISTRY, BaseDataset, LuxonisDataset from luxonis_ml.data.utils.enums import LabelType from luxonis_ml.enums import DatasetType -from luxonis_ml.utils import LuxonisFileSystem +from luxonis_ml.utils import LuxonisFileSystem, environ +from luxonis_ml.utils.filesystem import _pip_install from .base_parser import BaseParser from .classification_directory_parser import ClassificationDirectoryParser @@ -72,8 +74,15 @@ def __init__( appropriate parser. @type dataset_dir: str - @param dataset_dir: Path to the dataset directory or zip file. - Can also be a remote URL supported by L{LuxonisFileSystem}. + @param dataset_dir: Identifier of the dataset directory. + Can be one of: + - Local path to the dataset directory. + - Remote URL supported by L{LuxonisFileSystem}. + - C{gcs://} for Google Cloud Storage + - C{s3://} for Amazon S3 + - C{roboflow://} for Roboflow datasets. + - Expected format: C{roboflow://workspace/project/version/format}. + Can be a remote URL supported by L{LuxonisFileSystem}. @type dataset_name: Optional[str] @param dataset_name: Name of the dataset. If C{None}, the name is derived from the name of the dataset directory. @@ -97,9 +106,16 @@ def __init__( names. """ save_dir = Path(save_dir) if save_dir else None - name = Path(dataset_dir).name - local_path = (save_dir or Path.cwd()) / name - self.dataset_dir = LuxonisFileSystem.download(dataset_dir, local_path) + if dataset_dir.startswith("roboflow://"): + self.dataset_dir, name = self._download_roboflow_dataset( + dataset_dir, save_dir + ) + else: + name = dataset_dir.split("/")[-1] + local_path = (save_dir or Path.cwd()) / name + self.dataset_dir = LuxonisFileSystem.download( + dataset_dir, local_path + ) if self.dataset_dir.suffix == ".zip": with zipfile.ZipFile(self.dataset_dir, "r") as zip_ref: unzip_dir = self.dataset_dir.parent / self.dataset_dir.stem @@ -237,3 +253,43 @@ def _parse_split( return self.parser.parse_split( split, random_split, split_ratios, **parsed_kwargs, **kwargs ) + + def _download_roboflow_dataset( + self, dataset_dir: str, local_path: Optional[Path] + ) -> Tuple[Path, str]: + if find_spec("roboflow") is None: + _pip_install("roboflow", "roboflow", "0.1.1") + + from roboflow import Roboflow + + if environ.ROBOFLOW_API_KEY is None: + raise RuntimeError( + "ROBOFLOW_API_KEY environment variable is not set. " + "Please set it to your Roboflow API key." + ) + + rf = Roboflow(api_key=environ.ROBOFLOW_API_KEY) + parts = dataset_dir.split("roboflow://")[1].split("/") + if len(parts) != 4: + raise ValueError( + f"Incorrect Roboflow dataset URL: `{dataset_dir}`. " + "Expected format: `roboflow://workspace/project/version/format`." + ) + workspace, project, version, format = dataset_dir.split("roboflow://")[ + 1 + ].split("/") + try: + version = int(version) + except ValueError as e: + raise ValueError( + f"Roboflow version must be an integer, got `{version}`." + ) from e + + local_path = local_path or Path.cwd() / f"{project}_{format}" + dataset = ( + rf.workspace(workspace) + .project(project) + .version(int(version)) + .download(format, str(local_path / project)) + ) + return Path(dataset.location), project diff --git a/luxonis_ml/data/requirements.txt b/luxonis_ml/data/requirements.txt index 5e787cfc..2a6b5927 100644 --- a/luxonis_ml/data/requirements.txt +++ b/luxonis_ml/data/requirements.txt @@ -12,3 +12,4 @@ pycocotools>=2.0.7 typeguard>=4.1.0 polars[timezone]>=0.20.31 ordered-set>=4.0.0 +# roboflow>=0.1.1 diff --git a/luxonis_ml/utils/environ.py b/luxonis_ml/utils/environ.py index 342347b0..6ff89728 100644 --- a/luxonis_ml/utils/environ.py +++ b/luxonis_ml/utils/environ.py @@ -34,6 +34,8 @@ class Environ(BaseSettings): LUXONISML_BASE_PATH: Path = Path.home() / "luxonis_ml" LUXONISML_TEAM_ID: str = "offline" + ROBOFLOW_API_KEY: Optional[str] = None + GOOGLE_APPLICATION_CREDENTIALS: Optional[str] = None LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = ( diff --git a/luxonis_ml/utils/filesystem.py b/luxonis_ml/utils/filesystem.py index e9a2e642..4c95cbbd 100644 --- a/luxonis_ml/utils/filesystem.py +++ b/luxonis_ml/utils/filesystem.py @@ -674,19 +674,12 @@ def upload(local_path: PathType, url: str) -> None: def _check_package_installed(protocol: str) -> None: # pragma: no cover - def _pip_install(package: str, version: str) -> None: - logger.error(f"{package} is necessary for {protocol} protocol.") - logger.info(f"Installing {package}...") - subprocess.run( - [sys.executable, "-m", "pip", "install", f"{package}>={version}"] - ) - if protocol in ["gs", "gcs"] and find_spec("gcsfs") is None: - _pip_install("gcsfs", "2023.3.0") + _pip_install(protocol, "gcsfs", "2023.3.0") elif protocol == "s3" and find_spec("s3fs") is None: - _pip_install("s3fs", "2023.3.0") + _pip_install(protocol, "s3fs", "2023.3.0") elif protocol == "mlflow" and find_spec("mlflow") is None: - _pip_install("mlflow", "2.10.0") + _pip_install(protocol, "mlflow", "2.10.0") def _get_protocol_and_path(path: str) -> Tuple[str, Optional[str]]: @@ -702,3 +695,11 @@ def _get_protocol_and_path(path: str) -> Tuple[str, Optional[str]]: protocol = "file" return protocol, path if path else None + + +def _pip_install(protocol: str, package: str, version: str) -> None: + logger.error(f"'{package}' is necessary for '{protocol}://' protocol.") + logger.info(f"Installing {package}...") + subprocess.run( + [sys.executable, "-m", "pip", "install", f"{package}>={version}"] + ) diff --git a/tests/test_data/test_parsers.py b/tests/test_data/test_parsers.py index 405fb6c0..6acc59b4 100644 --- a/tests/test_data/test_parsers.py +++ b/tests/test_data/test_parsers.py @@ -4,6 +4,7 @@ from luxonis_ml.data import LabelType, LuxonisLoader, LuxonisParser from luxonis_ml.enums import DatasetType +from luxonis_ml.utils import environ URL_PREFIX: Final[str] = "gs://luxonis-test-bucket/luxonis-ml-test-data" WORK_DIR: Final[str] = "tests/data/parser_datasets" @@ -82,13 +83,24 @@ def prepare_dir(): "D1_ParkingSlot-solo.zip", [LabelType.BOUNDINGBOX, LabelType.SEGMENTATION], ), + ( + DatasetType.COCO, + "roboflow://team-roboflow/coco-128/2/coco", + [LabelType.BOUNDINGBOX, LabelType.CLASSIFICATION], + ), ], ) def test_dir_parser( dataset_type: DatasetType, url: str, expected_label_types: List[LabelType] ): + if not url.startswith("roboflow://"): + url = f"{URL_PREFIX}/{url}" + + elif environ.ROBOFLOW_API_KEY is None: + pytest.skip("Roboflow API key is not set") + parser = LuxonisParser( - f"{URL_PREFIX}/{url}", + url, dataset_name=f"test-{dataset_type}", delete_existing=True, save_dir=WORK_DIR,