Skip to content

Commit

Permalink
Secure Code Eval changes (mosaicml#2679)
Browse files Browse the repository at this point in the history
* fix code eval

* do not set default

* add test

* add errors

* fix params

* fix test

* skip

* upload

* fix test

* comment
  • Loading branch information
mvpatel2000 authored Oct 31, 2023
1 parent 0e521a5 commit d23b349
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 11 deletions.
22 changes: 11 additions & 11 deletions composer/metrics/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,16 +528,16 @@ def __init__(self, dist_sync_on_step: bool = False):
self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')

self.eval_device = 'LAMBDA'
if not 'CODE_EVAL_DEVICE' in os.environ:
if 'MOSAICML_PLATFORM' in os.environ:
log.info('Defaulting to MOSAICML evaluation on the MosaicML Platform')
self.eval_device = 'MOSAICML'
else:
log.info(f"'CODE_EVAL_DEVICE' env var was not set, so defaulting to 'LAMBDA' as eval device")
os.environ['CODE_EVAL_DEVICE'] = 'LAMBDA'
else:
self.eval_device = os.environ['CODE_EVAL_DEVICE'].upper()
raise ValueError(
    'Attempting to use InContextLearningCodeEvalAccuracy but environment '
    'variable `CODE_EVAL_DEVICE` is not set. Please set `CODE_EVAL_DEVICE` '
    'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda '
    'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
self.eval_device = os.environ['CODE_EVAL_DEVICE'].upper()
if self.eval_device not in ('LOCAL', 'LAMBDA', 'MOSAICML'):
raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
'`LAMBDA`, or `MOSAICML`.')

def get_client(self) -> EvalClient:
"""Returns a client for the appropriate remote platform."""
Expand All @@ -554,9 +554,9 @@ def get_client(self) -> EvalClient:
elif self.eval_device == 'MOSAICML':
client = MosaicMLLambdaEvalClient()
else:
raise Exception(
raise ValueError(
'Remote platforms apart from Lambdas/MOSAICML are not yet supported. Please set environment variable '
'CODE_EVAL_DEVICE to LOCAL or LAMBDA, or run on the MosaicML Platform.')
'`CODE_EVAL_DEVICE` to `LOCAL`, `LAMBDA`, or `MOSAICML`.')
return client

def estimator(self, n: int, c: int, k: int) -> float:
Expand Down
4 changes: 4 additions & 0 deletions composer/utils/eval_client/mosaicml_lambda_eval_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import os
import time
from http import HTTPStatus
from typing import Dict, List

import mcli
Expand Down Expand Up @@ -53,6 +54,9 @@ def invoke(self, payload: List[List[List[Dict[str, str]]]]) -> List[List[List[bo
log.error(f'Failed to get code eval output after {self.num_retries} retries. Error: {e}')
log.warning(f'Failed to get code eval output, retrying in {self.backoff**i} seconds.')
time.sleep(self.backoff**i)
elif e.status == HTTPStatus.UNAUTHORIZED:
raise RuntimeError('Failed to get code eval output due to UNAUTHORIZED error. '
'Please ensure you have access to MosaicMLLambdaEvalClient.') from e
else:
log.error(f'Failed to get code eval output with unexpected MAPIException. Error: {e}')
break
Expand Down
6 changes: 6 additions & 0 deletions tests/datasets/test_in_context_learning_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,6 +1343,12 @@ def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_ur
assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0


@pytest.mark.gpu  # Run on MosaicML platform
def test_code_eval_requires_envvar():
    """Constructing the metric without `CODE_EVAL_DEVICE` set must raise a ValueError."""
    expected_message = 'Attempting to use InContextLearningCodeEvalAccuracy but.*'
    with pytest.raises(ValueError, match=expected_message):
        InContextLearningCodeEvalAccuracy()


@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@device('gpu')
@world_size(1, 2)
Expand Down

0 comments on commit d23b349

Please sign in to comment.