Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docker job launcher provision #3116

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion nvflare/app_opt/job_launcher/docker_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,10 @@ def get_command(self, job_meta, fl_ctx) -> (str, str):
class ClientDockerJobLauncher(DockerJobLauncher):
def get_command(self, job_meta, fl_ctx) -> (str, str):
job_id = job_meta.get(JobConstants.JOB_ID)
client_name = fl_ctx.get_identity_name()
command = generate_client_command(fl_ctx)

return f"client-{job_id}", command
return f"{client_name}-{job_id}", command


class ServerDockerJobLauncher(DockerJobLauncher):
Expand Down
7 changes: 6 additions & 1 deletion nvflare/lighter/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class PropKey:
OVERSEER_END_POINT = "overseer_end_point"
ADMIN_PORT = "admin_port"
FED_LEARN_PORT = "fed_learn_port"
DOCKER_COMM_PORT = "docker_comm_port"
CONN_SECURITY = "connection_security"
CUSTOM_CA_CERT = "custom_ca_cert"

Expand Down Expand Up @@ -86,6 +87,7 @@ class OverseerRole:
class TemplateSectionKey:
START_SERVER_SH = "start_svr_sh"
START_CLIENT_SH = "start_cln_sh"
DOCKER_BUILD_SH = "docker_build_sh"
DOCKER_SERVER_SH = "docker_svr_sh"
DOCKER_CLIENT_SH = "docker_cln_sh"
DOCKER_ADMIN_SH = "docker_adm_sh"
Expand All @@ -96,6 +98,7 @@ class TemplateSectionKey:
SUB_START_SH = "sub_start_sh"
STOP_FL_SH = "stop_fl_sh"
LOG_CONFIG = "log_config"
COMM_CONFIG = "comm_config"
LOCAL_SERVER_RESOURCES = "local_server_resources"
LOCAL_CLIENT_RESOURCES = "local_client_resources"
SAMPLE_PRIVACY = "sample_privacy"
Expand All @@ -119,11 +122,13 @@ class ProvFileName:
START_SH = "start.sh"
SUB_START_SH = "sub_start.sh"
PRIVILEGE_YML = "privilege.yml"
DOCKER_SH = "docker.sh"
DOCKER_BUILD_SH = "docker_build.sh"
DOCKER_SH = "start_docker.sh"
GUNICORN_CONF_PY = "gunicorn.conf.py"
FED_SERVER_JSON = "fed_server.json"
FED_CLIENT_JSON = "fed_client.json"
STOP_FL_SH = "stop_fl.sh"
COMM_CONFIG = "comm_config.json"
LOG_CONFIG_DEFAULT = "log_config.json.default"
RESOURCES_JSON_DEFAULT = "resources.json.default"
PRIVACY_JSON_SAMPLE = "privacy.json.sample"
Expand Down
9 changes: 9 additions & 0 deletions nvflare/lighter/dummy_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ participants:
org: nvidia
fed_learn_port: 8002
admin_port: 8003
docker_comm_port: 8005
- name: site-1
type: client
org: nvidia
Expand All @@ -19,6 +20,7 @@ participants:
# as a server for 3rd-party integration.
# The value must be a hostname that the external trainer can reach via the network.
# listening_host: site-1-lh
docker_comm_port: 8006
- name: site-2
type: client
org: nvidia
Expand Down Expand Up @@ -65,5 +67,12 @@ builders:
args:
sp_end_point: server1:8002:8003

docker_image: localhost/nvflare:0.0.1

- path: nvflare.lighter.impl.docker.DockerBuilder
args:
docker_image: localhost/nvflare:0.0.1
base_image: python:3.10
requirements_file: docker_compose_requirements.txt
- path: nvflare.lighter.impl.cert.CertBuilder
- path: nvflare.lighter.impl.signature.SignatureBuilder
70 changes: 66 additions & 4 deletions nvflare/lighter/impl/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,24 @@
# limitations under the License.

import copy
import json
import os
import shutil

import yaml

from nvflare.lighter.constants import CtxKey, ProvFileName, TemplateSectionKey
from nvflare.app_opt.job_launcher.docker_launcher import ClientDockerJobLauncher, ServerDockerJobLauncher
from nvflare.lighter import utils
from nvflare.lighter.constants import CtxKey, PropKey, ProvFileName, TemplateSectionKey
from nvflare.lighter.spec import Builder, Project, ProvisionContext


class DockerBuilder(Builder):
def __init__(self, base_image="python:3.8", requirements_file="requirements.txt"):
def __init__(
self, docker_image="nvflare-docker:0.0.1", base_image="python:3.8", requirements_file="requirements.txt"
):
"""Build docker compose file."""
self.docker_image = docker_image
self.base_image = base_image
self.requirements_file = requirements_file
self.services = {}
Expand Down Expand Up @@ -57,7 +63,31 @@ def _build_server(self, server, ctx: ProvisionContext):
info_dict["container_name"] = server.name
self.services[server.name] = info_dict

def _build_client(self, client):
# local folder creation
dest_dir = ctx.get_local_dir(server)
with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f:
resources = json.load(f)
resources["components"].append(
{
"id": "docker_launcher",
"path": ServerDockerJobLauncher().__module__ + "." + "ServerDockerJobLauncher",
"args": {},
}
)
utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t")

communication_port = server.get_prop(CtxKey.DOCKER_COMM_PORT)
if communication_port:
replacement_dict = {"comm_host_name": "server-parent", "communication_port": communication_port}
ctx.build_from_template(
dest_dir,
TemplateSectionKey.COMM_CONFIG,
ProvFileName.COMM_CONFIG,
replacement=replacement_dict,
exe=True,
)

def _build_client(self, client, ctx: ProvisionContext):
info_dict = copy.deepcopy(self.services["__flclient__"])
info_dict["volumes"] = [f"./{client.name}:" + "${WORKSPACE}"]
info_dict["build"] = "nvflare_compose"
Expand All @@ -71,6 +101,30 @@ def _build_client(self, client):
info_dict["container_name"] = client.name
self.services[client.name] = info_dict

# local folder creation
dest_dir = ctx.get_local_dir(client)
with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f:
resources = json.load(f)
resources["components"].append(
{
"id": "docker_launcher",
"path": ClientDockerJobLauncher().__module__ + "." + "ClientDockerJobLauncher",
"args": {},
}
)
utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t")

communication_port = client.get_prop(PropKey.DOCKER_COMM_PORT)
if communication_port:
replacement_dict = {"comm_host_name": client.name + "-parent", "communication_port": communication_port}
ctx.build_from_template(
dest_dir,
TemplateSectionKey.COMM_CONFIG,
ProvFileName.COMM_CONFIG,
replacement=replacement_dict,
exe=True,
)

def build(self, project: Project, ctx: ProvisionContext):
compose = ctx.yaml_load_template_section(TemplateSectionKey.COMPOSE_YAML)
self.services = compose.get("services")
Expand All @@ -83,7 +137,7 @@ def build(self, project: Project, ctx: ProvisionContext):
self._build_server(server, ctx)

for client in project.get_clients():
self._build_client(client)
self._build_client(client, ctx)

self.services.pop("__overseer__", None)
self.services.pop("__flserver__", None)
Expand All @@ -101,6 +155,14 @@ def build(self, project: Project, ctx: ProvisionContext):
with open(os.path.join(compose_build_dir, ProvFileName.DOCKERFILE), "wt") as f:
f.write(f"FROM {self.base_image}\n")
f.write(ctx.get_template_section(TemplateSectionKey.DOCKERFILE))
replacement_dict = {"image": self.docker_image}
ctx.build_from_template(
compose_build_dir,
TemplateSectionKey.DOCKER_BUILD_SH,
ProvFileName.DOCKER_BUILD_SH,
replacement=replacement_dict,
exe=True,
)
try:
shutil.copyfile(self.requirements_file, os.path.join(compose_build_dir, ProvFileName.REQUIREMENTS_TXT))
except Exception:
Expand Down
9 changes: 9 additions & 0 deletions nvflare/lighter/impl/static_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def _build_server(self, server: Participant, ctx: ProvisionContext):
server_0["name"] = project.name
admin_port = ctx.get(CtxKey.ADMIN_PORT)
fed_learn_port = ctx.get(CtxKey.FED_LEARN_PORT)
communication_port = server.get_prop(CtxKey.DOCKER_COMM_PORT)
server_0["service"]["target"] = f"{server.name}:{fed_learn_port}"
server_0["service"]["scheme"] = self.scheme
server_0["admin_host"] = server.name
Expand All @@ -155,6 +156,7 @@ def _build_server(self, server: Participant, ctx: ProvisionContext):
replacement_dict = {
"admin_port": admin_port,
"fed_learn_port": fed_learn_port,
"communication_port": communication_port,
"config_folder": self.config_folder,
"docker_image": self.docker_image,
"org_name": server.org,
Expand Down Expand Up @@ -214,7 +216,14 @@ def _build_client(self, client, ctx):
config["servers"][0]["service"]["scheme"] = self.scheme
config["servers"][0]["name"] = project.name
config["servers"][0]["identity"] = server.name # the official identity of the server
admin_port = ctx.get(CtxKey.ADMIN_PORT)
fed_learn_port = ctx.get(CtxKey.FED_LEARN_PORT)
communication_port = client.get_prop(PropKey.DOCKER_COMM_PORT)
replacement_dict = {
"admin_port": admin_port,
"fed_learn_port": fed_learn_port,
"communication_port": communication_port,
"comm_host_name": client.name + "-parent",
"client_name": f"{client.subject}",
"config_folder": self.config_folder,
"docker_image": self.docker_image,
Expand Down
89 changes: 48 additions & 41 deletions nvflare/lighter/templates/master_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,17 @@ local_server_resources: |
]
}

comm_config: |
{
"internal": {
"scheme": "tcp",
"resources": {
"host": "{~~comm_host_name~~}",
"port": {~~communication_port~~}
}
}
}

fed_server: |
{
"format_version": 2,
Expand Down Expand Up @@ -575,60 +586,44 @@ sub_start_sh: |
docker_cln_sh: |
#!/usr/bin/env bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# docker run script for FL client
# local data directory
: ${MY_DATA_DIR:="/home/flclient/data"}
# The syntax above is to set MY_DATA_DIR to /home/flclient/data if this
# environment variable is not set previously.
# Therefore, users can set their own MY_DATA_DIR with
# export MY_DATA_DIR=$SOME_DIRECTORY
# before running docker.sh

# for all gpus use line below
#GPU2USE='--gpus=all'
# for 2 gpus use line below
#GPU2USE='--gpus=2'
# for specific gpus as gpu#0 and gpu#2 use line below
#GPU2USE='--gpus="device=0,2"'
# to use host network, use line below
NETARG="--net=host"
# FL clients do not need to open ports, so the following line is not needed.
#NETARG="-p 443:443 -p 8003:8003"
# docker run script for FL client
DOCKER_IMAGE={~~docker_image~~}
echo "Starting docker with $DOCKER_IMAGE"
mode="${1:--r}"
if [ $mode = "-d" ]
then
docker run -d --rm --name={~~client_name~~} $GPU2USE -u $(id -u):$(id -g) \
-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/workspace/ \
-v $MY_DATA_DIR:/data/:ro -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE \
/bin/bash -c "python -u -m nvflare.private.fed.app.client.client_train -m /workspace -s fed_client.json --set uid={~~client_name~~} secure_train=true config_folder=config org={~~org_name~~}"

NETWORK_NAME="nvflare-network"
if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then
echo "Network '${NETWORK_NAME}' exists."
else
docker run --rm -it --name={~~client_name~~} $GPU2USE -u $(id -u):$(id -g) \
-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/workspace/ \
-v $MY_DATA_DIR:/data/:ro -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE /bin/bash
docker network create $NETWORK_NAME
fi

docker run --name {~~comm_host_name~~} --network nvflare-network \
-v $DIR/..:/workspace \
-v /var/run/docker.sock:/var/run/docker.sock \
-p {~~communication_port~~}:{~~communication_port~~} \
-it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh \
{~~client_name~~} server-parent:{~~fed_learn_port~~}:{~~admin_port~~}"

docker_svr_sh: |
#!/usr/bin/env bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# docker run script for FL server
# to use host network, use line below
NETARG="--net=host"
# or to expose specific ports, use line below
#NETARG="-p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~}"
DOCKER_IMAGE={~~docker_image~~}
echo "Starting docker with $DOCKER_IMAGE"
svr_name="${SVR_NAME:-flserver}"
mode="${1:-r}"
if [ $mode = "-d" ]
then
docker run -d --rm --name=$svr_name -v $DIR/..:/workspace/ -w /workspace \
--ipc=host $NETARG $DOCKER_IMAGE /bin/bash -c \
"python -u -m nvflare.private.fed.app.server.server_train -m /workspace -s fed_server.json --set secure_train=true config_folder=config org={~~org_name~~}"

NETWORK_NAME="nvflare-network"
if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then
echo "Network '${NETWORK_NAME}' exists."
else
docker run --rm -it --name=$svr_name -v $DIR/..:/workspace/ -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE /bin/bash
docker network create $NETWORK_NAME
fi

docker run --name server-parent --network nvflare-network \
-v $DIR/..:/workspace \
-v /var/run/docker.sock:/var/run/docker.sock \
-p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~} \
-p {~~communication_port~~}:{~~communication_port~~} \
-it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh"

docker_adm_sh: |
#!/usr/bin/env bash
Expand Down Expand Up @@ -698,10 +693,22 @@ compose_yaml: |
nvflare_svc_persist:

dockerfile: |
RUN mkdir /opt/NVFlare
WORKDIR /opt/NVFlare
RUN pip install -U pip
RUN pip install nvflare
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

RUN apt-get update
RUN apt install docker.io -y
WORKDIR /workspace

docker_build_sh: |
#!/usr/bin/env bash
docker image rm {~~image~~}
docker build -t {~~image~~} -f Dockerfile .
docker push {~~image~~}

helm_chart_chart: |
apiVersion: v2
Expand Down
Loading