diff --git a/admin/setup_workers.sh b/admin/setup_workers.sh index c0afbfe7..ddb00fb2 100755 --- a/admin/setup_workers.sh +++ b/admin/setup_workers.sh @@ -4,11 +4,12 @@ set -e source ./config.sh +# Max upload size is 100 MiB gcloud compute --project "${GCLOUD_PROJECT}" \ instance-templates create "worker-instance-template" \ --machine-type "${MACHINE_TYPE}" \ --network "default" \ - --metadata "^#&&#^halite-manager-url=${COORDINATOR_URL}#&&#halite-secret-folder=${SECRET_FOLDER}#&&#startup-script=$(cat setup_workers__startup_script.sh)" \ + --metadata "^#&&#^halite-manager-url=${COORDINATOR_URL}#&&#halite-secret-folder=${SECRET_FOLDER}#&&#startup-script=$(cat setup_workers__startup_script.sh)#&&#halite-max-upload-size=104857600" \ --no-restart-on-failure \ --no-service-account --no-scopes \ --maintenance-policy "TERMINATE" \ diff --git a/apiserver/apiserver/config.py b/apiserver/apiserver/config.py index 5bab8650..cb8fa3b0 100644 --- a/apiserver/apiserver/config.py +++ b/apiserver/apiserver/config.py @@ -16,6 +16,7 @@ # Flask settings # Max size of an upload, in bytes MAX_BOT_UPLOAD_SIZE = 20 * 1024 * 1024 +# Needs to match corresponding value in worker configuration MAX_COMPILED_BOT_UPLOAD_SIZE = 100 * 1024 * 1024 # Secret key for Flask session cookies FLASK_SECRET_KEY = "" diff --git a/apiserver/worker/backend.py b/apiserver/worker/backend.py index 77ebbabb..2a23c052 100644 --- a/apiserver/worker/backend.py +++ b/apiserver/worker/backend.py @@ -3,7 +3,15 @@ from hashlib import md5 import json import os -from time import gmtime, strftime +from time import gmtime, strftime, sleep + + +# Needs to match corresponding value in apiserver/config.py +# This is the default value, 100 MiB +MAX_BOT_UPLOAD_SIZE = 100 * 1024 * 1024 +# Maximum wait time in between compiled bot archive upload attempts, +# in seconds +MAX_UPLOAD_BACKOFF = 32 with open("config.json") as configfile: @@ -11,6 +19,9 @@ MANAGER_URL = config["MANAGER_URL"] SECRET_FOLDER = config["SECRET_FOLDER"] 
CAPABILITIES = config.get("CAPABILITIES", []) + provided_size = config.get("MAX_BOT_UPLOAD_SIZE", MAX_BOT_UPLOAD_SIZE) + if provided_size: + MAX_BOT_UPLOAD_SIZE = provided_size def getTask(): @@ -84,10 +95,14 @@ def storeBotLocally(user_id, bot_id, storage_dir, is_compile=False): def storeBotRemotely(user_id, bot_id, zip_file_path): """Posts a bot file to the manager""" zip_contents = open(zip_file_path, "rb").read() + if len(zip_contents) > MAX_BOT_UPLOAD_SIZE: + raise RuntimeError("Bot archive exceeds maximum size of 100 MiB.") + iterations = 0 local_hash = md5(zip_contents).hexdigest() + backoff = 1 - while iterations < 100: + while iterations < 10: r = requests.post(MANAGER_URL+"botFile", data={ "user_id": str(user_id), @@ -95,11 +110,18 @@ def storeBotRemotely(user_id, bot_id, zip_file_path): }, files={"bot.zip": zip_contents}) print("Posting compiled bot archive %s\n" % r.text) + if r.status_code >= 400 and r.status_code <= 499: + print("Got a 4xx status code") + r.raise_for_status() # Try again if local and remote hashes differ if local_hash != getBotHash(user_id, bot_id): print("Hashes do not match! 
Redoing file upload...\n") iterations += 1 + sleep(backoff) + if backoff < MAX_UPLOAD_BACKOFF: + backoff *= 2 + continue return diff --git a/apiserver/worker/grab_config.py b/apiserver/worker/grab_config.py index 1e1f2d9c..7bdd47e6 100644 --- a/apiserver/worker/grab_config.py +++ b/apiserver/worker/grab_config.py @@ -8,6 +8,7 @@ MANAGER_URL_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-manager-url" SECRET_FOLDER_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-secret-folder" GPU_CAPABILITY_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-gpu" +MAX_UPLOAD_SIZE_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-max-upload-size" MANAGER_URL = requests.get(MANAGER_URL_METADATA_URL, headers={ "Metadata-Flavor": "Google" @@ -18,10 +19,20 @@ HAS_GPU = requests.get(GPU_CAPABILITY_METADATA_URL, headers={ "Metadata-Flavor": "Google" }).text == "true" +MAX_UPLOAD_SIZE = requests.get(MAX_UPLOAD_SIZE_METADATA_URL, headers={ + "Metadata-Flavor": "Google" +}).text + +try: + MAX_UPLOAD_SIZE = int(MAX_UPLOAD_SIZE) +except (ValueError, TypeError): + MAX_UPLOAD_SIZE = None + with open("config.json", "w") as configfile: json.dump({ "MANAGER_URL": MANAGER_URL, "SECRET_FOLDER": SECRET_FOLDER, "CAPABILITIES": ["gpu"] if HAS_GPU else [], + "MAX_BOT_UPLOAD_SIZE": MAX_UPLOAD_SIZE, }, configfile) diff --git a/apiserver/worker/worker.py b/apiserver/worker/worker.py index 80d6f597..c75ed2b8 100644 --- a/apiserver/worker/worker.py +++ b/apiserver/worker/worker.py @@ -62,6 +62,14 @@ """ +UPLOAD_ERROR_MESSAGE = """ +We had some trouble uploading your bot. If you cannot figure out why +this happened, please email us at halite@halite.io. We can help. 
+ +For our reference, here is the trace of the error: +""" + + def makePath(path): """Deletes anything residing at path, creates path, and chmods the directory""" if os.path.exists(path): @@ -138,8 +146,9 @@ def executeCompileTask(user_id, bot_id, backend): try: if didCompile: logging.debug("Bot did compile\n") - archive.zipFolder(temp_dir, os.path.join(temp_dir, str(user_id)+".zip")) - backend.storeBotRemotely(user_id, bot_id, os.path.join(temp_dir, str(user_id)+".zip")) + archive_path = os.path.join(temp_dir, str(user_id)+".zip") + archive.zipFolder(temp_dir, archive_path) + backend.storeBotRemotely(user_id, bot_id, archive_path) else: logging.debug("Bot did not compile\n") logging.debug("Bot errors %s\n" % str(errors)) @@ -147,6 +156,12 @@ def executeCompileTask(user_id, bot_id, backend): backend.compileResult(user_id, bot_id, didCompile, language, errors=(None if didCompile else "\n".join(errors))) + except Exception: + logging.debug("Bot did not upload\n") + traceback.print_exc() + errors.append(UPLOAD_ERROR_MESSAGE + traceback.format_exc()) + backend.compileResult(user_id, bot_id, False, language, + errors="\n".join(errors)) finally: # Remove files as bot user (Python will clean up tempdir, but we don't # necessarily have permissions to clean up files)