Implement experimental AWS SageMaker support (#1968)
* Create SageMaker runner and submit jobs to pipeline

* Suddenly working?

* Pretty-print

* Update Dockerfile

* Handle training steps, processing steps differently

* Successful launch spot training job

* Enable GPU jobs

* Use Estimator

* No need for temporary scripts

* GPU and dependencies for splits

* update RV version in requirements

* use pipeline_run_name in job name

* add use_spot_instances to config template and fix formatting

* remove comment

* fix docstring

* refactor

* rename "sagemaker" to "aws_sagemaker" for consistency with "aws_batch"

* add use_spot_instances to rv config schema

* refactor

* add to API docs

* update docs

* add unit tests

* replace ScriptProcessor with Processor

* shorten default pipeline run name

* "inst" --> "instance"

* bump sagemaker version

---------

Co-authored-by: James McClain <[email protected]>
Co-authored-by: Adeel Hassan <[email protected]>
3 people authored Nov 1, 2023
1 parent 7106652 commit e56f93d
Showing 13 changed files with 384 additions and 15 deletions.
21 changes: 12 additions & 9 deletions Dockerfile
@@ -120,8 +120,9 @@ COPY ./rastervision_aws_s3/requirements.txt /opt/src/s3-requirements.txt
COPY ./rastervision_core/requirements.txt /opt/src/core-requirements.txt
COPY ./rastervision_gdal_vsi/requirements.txt /opt/src/gdal-requirements.txt
COPY ./rastervision_pipeline/requirements.txt /opt/src/pipeline-requirements.txt
COPY ./rastervision_aws_sagemaker/requirements.txt /opt/src/sagemaker-requirements.txt
COPY ./requirements-dev.txt /opt/src/requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip cat batch-requirements.txt s3-requirements.txt core-requirements.txt gdal-requirements.txt pipeline-requirements.txt sagemaker-requirements.txt requirements-dev.txt | sort | uniq > all-requirements.txt && \
pip install $(grep -ivE "^\s*$|^#|rastervision_*" all-requirements.txt) && \
rm all-requirements.txt

@@ -140,13 +141,14 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -r docs/pandoc-requir
#------------------------------------------------------------------------

ENV PYTHONPATH=/opt/src:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_pipeline/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_aws_s3/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_aws_batch/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_aws_sagemaker/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_core/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_gdal_vsi/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_pytorch_backend/:$PYTHONPATH
ENV PYTHONPATH=/opt/src/rastervision_pytorch_learner/:$PYTHONPATH

COPY scripts /opt/src/scripts/
COPY scripts/rastervision /usr/local/bin/rastervision
@@ -155,12 +157,13 @@ COPY integration_tests /opt/src/integration_tests/
COPY .flake8 /opt/src/.flake8
COPY .coveragerc /opt/src/.coveragerc

COPY ./rastervision_pipeline/ /opt/src/rastervision_pipeline/
COPY ./rastervision_aws_s3/ /opt/src/rastervision_aws_s3/
COPY ./rastervision_aws_batch/ /opt/src/rastervision_aws_batch/
COPY ./rastervision_aws_sagemaker/ /opt/src/rastervision_aws_sagemaker/
COPY ./rastervision_core/ /opt/src/rastervision_core/
COPY ./rastervision_gdal_vsi/ /opt/src/rastervision_gdal_vsi/
COPY ./rastervision_pytorch_backend/ /opt/src/rastervision_pytorch_backend/
COPY ./rastervision_pytorch_learner/ /opt/src/rastervision_pytorch_learner/

CMD ["bash"]
1 change: 1 addition & 0 deletions docs/api_reference/index.rst
@@ -15,5 +15,6 @@ API Reference
rastervision.pytorch_backend
rastervision.aws_s3
rastervision.aws_batch
rastervision.aws_sagemaker
rastervision.gdal_vsi

4 changes: 4 additions & 0 deletions docs/conf.py
@@ -204,6 +204,10 @@ def setup(app: 'Sphinx') -> None:
'https://onnx.ai/onnx/',
'https://onnx.ai/onnx/objects.inv',
),
'sagemaker': (
'https://sagemaker.readthedocs.io/en/stable/',
'https://sagemaker.readthedocs.io/en/stable/objects.inv',
),
}

#########################
15 changes: 11 additions & 4 deletions docs/framework/runners.rst
@@ -17,8 +17,8 @@ local
A ``rastervision run local ...`` command will use the ``LocalRunner``, which
builds a ``Makefile`` based on the pipeline and executes it on the host machine. This will run multiple pipelines in parallel, as well as splittable commands in parallel, by spawning new processes for each command, where each process runs ``rastervision run_command ...``.

``inprocess``
^^^^^^^^^^^^^

For debugging purposes, using ``rastervision run inprocess`` will run everything sequentially within a single process.

@@ -27,8 +27,8 @@ For debugging purposes, using ``rastervision run inprocess`` will run everything
Running remotely
-----------------

``batch``
^^^^^^^^^

Running ``rastervision run batch ...`` will submit a DAG (directed acyclic graph) of jobs to be run on AWS Batch, which will increase the instance count to meet the workload with low-cost spot instances, and terminate the instances when the queue of commands is finished. It can also run some commands on CPU instances (like ``chip``), and others on GPU (like ``train``), and will run multiple experiments in parallel, as well as splittable commands in parallel.

@@ -43,6 +43,13 @@ If you are running on AWS Batch or any other remote runner, you will not be able
.. note::
To run on AWS Batch, you'll need the proper setup. See :ref:`aws batch setup` for instructions.

.. _aws sagemaker:

``sagemaker``
^^^^^^^^^^^^^

Running ``rastervision run sagemaker ...`` will submit a DAG (directed acyclic graph) of jobs represented as a SageMaker Pipeline to be run on AWS SageMaker.
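As with the other runners, the pipeline is launched from the CLI; a minimal invocation might look like the following (``my_pipeline.py`` is a placeholder for your pipeline configuration module):

.. code-block:: console

    > rastervision run sagemaker my_pipeline.py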

.. _parallelizing commands:

Running Commands in Parallel
26 changes: 26 additions & 0 deletions docs/setup/gpu.rst
@@ -77,3 +77,29 @@ AWS Batch is a service that makes it easier to run Dockerized computation pipeli

.. seealso::
For more information about how Raster Vision uses AWS Batch, see the section: :ref:`aws batch`.


.. _aws sagemaker setup:

Running on AWS SageMaker
------------------------

To run on AWS SageMaker, add a ``[SAGEMAKER]`` section to your Raster Vision configuration, e.g.:

.. code:: ini

    [SAGEMAKER]
    exec_role=AmazonSageMakerExecutionRole
    cpu_image=123.dkr.ecr.us-east-1.amazonaws.com/raster-vision
    cpu_instance_type=ml.p3.2xlarge
    gpu_image=123.dkr.ecr.us-east-1.amazonaws.com/raster-vision
    gpu_instance_type=ml.p3.2xlarge
    use_spot_instances=yes

* ``exec_role`` - Name of the IAM execution role that jobs will run as.
* ``cpu_image`` - Docker image URI for CPU jobs.
* ``cpu_instance_type`` - Instance type for CPU jobs.
* ``gpu_image`` - Docker image URI for GPU jobs.
* ``gpu_instance_type`` - Instance type for GPU jobs.
* ``use_spot_instances`` - Whether to use spot instances (``yes`` or ``no``).
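The runner reads ``use_spot_instances`` as a raw string and treats only the literal ``yes`` (case-insensitively) as true; a minimal sketch of that parsing (the function name is illustrative):

```python
def parse_use_spot_instances(value: str) -> bool:
    # Mirrors the runner's check: config('use_spot_instances').lower() == 'yes'
    return value.lower() == 'yes'
```

Any other value, including ``true`` or ``1``, leaves spot instances disabled.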

.. seealso::
    For more information about how Raster Vision uses AWS SageMaker, see the section: :ref:`aws sagemaker`.
18 changes: 16 additions & 2 deletions docs/setup/index.rst
@@ -68,7 +68,21 @@ is equivalent to running the following sequence of commands:
    > pip install rastervision_pytorch_learner=={{ version }}
    > pip install rastervision_pytorch_backend=={{ version }}
Extra plugins
^^^^^^^^^^^^^

:mod:`rastervision.aws_sagemaker`
"""""""""""""""""""""""""""""""""
This plugin adds the :class:`.AWSSageMakerRunner`, allowing you to run Raster Vision jobs on AWS SageMaker via ``rastervision run sagemaker ...``. To install:

.. code-block:: console

    > pip install rastervision_aws_sagemaker=={{ version }}

:mod:`rastervision.gdal_vsi`
""""""""""""""""""""""""""""

This plugin adds a new :class:`.FileSystem`, the :class:`.VsiFileSystem`, which allows the use of GDAL for IO. To install:

.. code-block:: console
@@ -87,7 +101,7 @@ Docker Images

Using the Docker images published for Raster Vision makes it easy to use a fully set up environment. We have tested this with Docker 20, although you may be able to use a lower version.

The images we publish include all plugins and dependencies for using Raster Vision with PyTorch and AWS. These are published to `quay.io/azavea/raster-vision <https://quay.io/repository/azavea/raster-vision>`_. To run the container for the latest release, run:

.. code-block:: console
1 change: 1 addition & 0 deletions rastervision_aws_sagemaker/MANIFEST.in
@@ -0,0 +1 @@
include requirements.txt
25 changes: 25 additions & 0 deletions rastervision_aws_sagemaker/rastervision/aws_sagemaker/__init__.py
@@ -0,0 +1,25 @@
# flake8: noqa


def register_plugin(registry):
from rastervision.aws_sagemaker.aws_sagemaker_runner import (
AWS_SAGEMAKER, AWSSageMakerRunner)
registry.set_plugin_version('rastervision.aws_sagemaker', 0)
registry.add_runner(AWS_SAGEMAKER, AWSSageMakerRunner)
registry.add_rv_config_schema(AWS_SAGEMAKER, [
'exec_role',
'cpu_image',
'cpu_instance_type',
'gpu_image',
'gpu_instance_type',
'use_spot_instances',
])


import rastervision.pipeline
from rastervision.aws_sagemaker.aws_sagemaker_runner import *

__all__ = [
'AWS_SAGEMAKER',
AWSSageMakerRunner.__name__,
]
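To illustrate the registration hook above, here is a self-contained sketch with a stand-in registry (``FakeRegistry`` and ``DummyRunner`` are illustrative placeholders, not part of Raster Vision):

```python
class FakeRegistry:
    """Stand-in that records what a plugin registers."""

    def __init__(self):
        self.plugin_versions = {}
        self.runners = {}
        self.config_schemas = {}

    def set_plugin_version(self, name, version):
        self.plugin_versions[name] = version

    def add_runner(self, name, runner_cls):
        self.runners[name] = runner_cls

    def add_rv_config_schema(self, namespace, fields):
        self.config_schemas[namespace] = fields


class DummyRunner:
    """Placeholder for AWSSageMakerRunner."""


def register_plugin(registry):
    # Same shape as the plugin's hook, with the real runner class
    # swapped for a dummy so the sketch is self-contained.
    AWS_SAGEMAKER = 'sagemaker'
    registry.set_plugin_version('rastervision.aws_sagemaker', 0)
    registry.add_runner(AWS_SAGEMAKER, DummyRunner)
    registry.add_rv_config_schema(AWS_SAGEMAKER, [
        'exec_role', 'cpu_image', 'cpu_instance_type',
        'gpu_image', 'gpu_instance_type', 'use_spot_instances',
    ])


registry = FakeRegistry()
register_plugin(registry)
```

Once the hook has run against the real registry, ``rastervision run sagemaker ...`` can resolve the runner by its registered name.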
@@ -0,0 +1,190 @@
from typing import TYPE_CHECKING, List, Optional, Union
import logging
from pprint import pprint

import boto3
from rastervision.pipeline import rv_config_ as rv_config
from rastervision.pipeline.runner import Runner

from sagemaker.processing import Processor
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline as SageMakerPipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import ProcessingStep, TrainingStep

if TYPE_CHECKING:
from rastervision.pipeline.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import _JobStepArguments

log = logging.getLogger(__name__)

AWS_SAGEMAKER = 'sagemaker'


class AWSSageMakerRunner(Runner):
    """Runs pipelines remotely using AWS SageMaker.

    Requires Everett configuration of form:

    .. code-block:: ini

        [SAGEMAKER]
        exec_role=
        cpu_image=
        cpu_instance_type=
        gpu_image=
        gpu_instance_type=
        use_spot_instances=
    """

def run(self,
cfg_json_uri: str,
pipeline: 'Pipeline',
commands: List[str],
num_splits: int = 1,
cmd_prefix: List[str] = [
'python', '-m', 'rastervision.pipeline.cli'
],
pipeline_run_name: str = 'rv'):
config = rv_config.get_namespace_config(AWS_SAGEMAKER)
exec_role = config('exec_role')

sagemaker_pipeline = self.build_pipeline(
cfg_json_uri,
pipeline,
commands,
num_splits,
cmd_prefix=cmd_prefix,
pipeline_run_name=pipeline_run_name)

# Submit the pipeline to SageMaker
iam_client = boto3.client('iam')
role_arn = iam_client.get_role(RoleName=exec_role)['Role']['Arn']
sagemaker_pipeline.upsert(role_arn=role_arn)
execution = sagemaker_pipeline.start()

pprint(execution.describe())

def build_pipeline(self,
cfg_json_uri: str,
pipeline: 'Pipeline',
commands: List[str],
num_splits: int = 1,
cmd_prefix: List[str] = [
'python', '-m', 'rastervision.pipeline.cli'
],
pipeline_run_name: str = 'rv') -> SageMakerPipeline:
"""Build a SageMaker Pipeline with each command as a step within it."""

config = rv_config.get_namespace_config(AWS_SAGEMAKER)
exec_role = config('exec_role')
cpu_image = config('cpu_image')
cpu_instance_type = config('cpu_instance_type')
gpu_image = config('gpu_image')
gpu_instance_type = config('gpu_instance_type')
use_spot_instances = config('use_spot_instances').lower() == 'yes'
sagemaker_session = PipelineSession()

steps = []

for command in commands:
use_gpu = command in pipeline.gpu_commands
job_name = f'{pipeline_run_name}-{command}'

cmd = cmd_prefix[:]

if rv_config.get_verbosity() > 1:
num_vs = rv_config.get_verbosity() - 1
# produces a string like "-vvv..."
verbosity_opt_str = f'-{"v" * num_vs}'
cmd += [verbosity_opt_str]

cmd.extend(['run_command', cfg_json_uri, command])

            if command in pipeline.split_commands and num_splits > 1:
                # If the step can be split, then split it into parts
                # that do not depend on each other (can run in
                # parallel).
                _steps = []
                for i in range(num_splits):
                    # Build a fresh command list per split; extending cmd
                    # in place would accumulate --split-ind options
                    # across iterations.
                    split_cmd = cmd + [
                        '--split-ind',
                        str(i),
                        '--num-splits',
                        str(num_splits),
                    ]
                    step = self.build_step(
                        step_name=f'{job_name}_{i+1}of{num_splits}',
                        cmd=split_cmd,
                        role=exec_role,
                        image_uri=gpu_image if use_gpu else cpu_image,
                        instance_type=(gpu_instance_type
                                       if use_gpu else cpu_instance_type),
                        use_spot_instances=use_spot_instances,
                        sagemaker_session=sagemaker_session,
                        use_gpu=use_gpu)
                    step.add_depends_on(steps)
                    _steps.append(step)
                steps.extend(_steps)
else:
# If the step can not be split, then submit it as-is.
step = self.build_step(
step_name=job_name,
cmd=cmd,
role=exec_role,
image_uri=gpu_image if use_gpu else cpu_image,
instance_type=(gpu_instance_type
if use_gpu else cpu_instance_type),
use_spot_instances=use_spot_instances,
sagemaker_session=sagemaker_session,
use_gpu=use_gpu)
step.add_depends_on(steps)
steps.append(step)

        # Assemble the steps into a SageMaker Pipeline; submission to
        # SageMaker happens in run().
sagemaker_pipeline = SageMakerPipeline(
name=pipeline_run_name,
steps=steps,
sagemaker_session=sagemaker_session,
)
return sagemaker_pipeline

def build_step(self, step_name: str, cmd: List[str], role: str,
image_uri: str, instance_type: str,
use_spot_instances: bool,
sagemaker_session: PipelineSession,
use_gpu: bool) -> Union[TrainingStep, ProcessingStep]:
        """Build a TrainingStep if use_gpu=True, otherwise a ProcessingStep.
        """
if use_gpu:
# For GPU-enabled steps, create an "Estimator".
# Formally this should probably not be used for prediction in
# this way, but it is expedient (especially given default
# service quotas, and other stuff).
step_estimator = Estimator(
container_entry_point=cmd,
image_uri=image_uri,
instance_count=1,
instance_type=instance_type,
max_retry_attempts=1,
role=role,
sagemaker_session=sagemaker_session,
                use_spot_instances=use_spot_instances,
)
step_args: Optional['_JobStepArguments'] = step_estimator.fit(
wait=False)
step = TrainingStep(step_name, step_args=step_args)
else:
            # For non-GPU-enabled steps, create a Processor.
step_processor = Processor(
role=role,
image_uri=image_uri,
instance_count=1,
instance_type=instance_type,
sagemaker_session=sagemaker_session,
entrypoint=cmd,
)
step_args: Optional['_JobStepArguments'] = step_processor.run(
wait=False)
step = ProcessingStep(step_name, step_args=step_args)

return step
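The control flow above — fanning splittable commands out into parallel steps, each depending on every previously added step — can be sketched independently of the SageMaker SDK (the function names here are illustrative, not part of the real runner):

```python
from typing import Dict, List


def build_split_cmds(base_cmd: List[str], num_splits: int) -> List[List[str]]:
    """One command list per split; each gets its own --split-ind."""
    return [
        base_cmd + ['--split-ind', str(i), '--num-splits', str(num_splits)]
        for i in range(num_splits)
    ]


def build_dag(commands: List[str], split_commands: List[str],
              num_splits: int) -> Dict[str, List[str]]:
    """Map each step name to the steps it depends on, mirroring how the
    runner makes every new step depend on all previously added steps."""
    dag: Dict[str, List[str]] = {}
    steps: List[str] = []
    for command in commands:
        if command in split_commands and num_splits > 1:
            new_steps = [f'{command}_{i+1}of{num_splits}'
                         for i in range(num_splits)]
        else:
            new_steps = [command]
        for name in new_steps:
            # Splits of the same command depend on all *earlier* steps,
            # but not on each other, so they can run in parallel.
            dag[name] = list(steps)
        steps.extend(new_steps)
    return dag


dag = build_dag(['chip', 'train', 'predict'], ['chip', 'predict'], 2)
```

Note that the splits of one command depend only on earlier commands, never on each other, which is what allows SageMaker to schedule them concurrently.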
2 changes: 2 additions & 0 deletions rastervision_aws_sagemaker/requirements.txt
@@ -0,0 +1,2 @@
rastervision_pipeline==0.21.4-dev
sagemaker==2.196.0
