diff --git a/tensorflow/base-cpu/Dockerfile b/tensorflow/base-cpu/Dockerfile
new file mode 100644
index 0000000..af1fc70
--- /dev/null
+++ b/tensorflow/base-cpu/Dockerfile
@@ -0,0 +1,89 @@
+FROM tensorflow/tensorflow:latest-py3 AS mlbench-worker-base-cpu
+# TODO: reduce size and complexity of image; pin the base tag instead of `latest` for reproducible builds.
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    make \
+    libc-dev \
+    musl-dev \
+    openssh-server \
+    g++ \
+    git \
+    curl \
+    sudo \
+    iproute2 && rm -rf /var/lib/apt/lists/*
+
+# -------------------- SSH (client: disable host key checking; key dirs owned by SSH_USER) --------------------
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ARG SSH_USER=root
+ENV SSH_USER=$SSH_USER
+RUN mkdir -p /ssh-key/$SSH_USER && chown -R $SSH_USER:$SSH_USER /ssh-key/$SSH_USER
+RUN mkdir -p /.sshd/host_keys && \
+    chown -R $SSH_USER:$SSH_USER /.sshd/host_keys && chmod 700 /.sshd/host_keys
+RUN mkdir -p /.sshd/user_keys/$SSH_USER && \
+    chown -R $SSH_USER:$SSH_USER /.sshd/user_keys/$SSH_USER && chmod 700 /.sshd/user_keys/$SSH_USER
+VOLUME /ssh-key/$SSH_USER
+
+# -------------------- Conda environment (Miniconda installed to /conda) --------------------
+RUN curl -fsSL -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && \
+    sh ~/miniconda.sh -b -p /conda && rm ~/miniconda.sh
+ENV PATH=/conda/bin:$PATH
+ENV LD_LIBRARY_PATH=/conda/lib:$LD_LIBRARY_PATH
+
+# TODO: Source code in Channel Anaconda can be outdated, switch to conda-forge if possible.
+# NOTE: magma-cuda90 (a CUDA 9.0 GPU package) is deliberately not installed in this CPU-only image.
+RUN conda install -y -c anaconda numpy pyyaml scipy mkl setuptools cmake cffi mkl-include typing \
+    && conda install -y -c mingfeima mkldnn \
+    && conda install -y -c conda-forge python-lmdb opencv numpy \
+    && conda clean --all -y
+
+# -------------------- Open MPI (built from source under /tmp, installed to /.openmpi) --------------------
+RUN mkdir /.openmpi/
+RUN apt-get update && apt-get install -y --no-install-recommends wget \
+    && cd /tmp \
+    && wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz \
+    && tar xzf openmpi-3.0.0.tar.gz \
+    && cd openmpi-3.0.0 \
+    && ./configure --prefix=/.openmpi/ \
+    && make all install \
+    && cd / && rm -rf /tmp/openmpi-3.0.0 /tmp/openmpi-3.0.0.tar.gz \
+    && apt-get remove -y wget && rm -rf /var/lib/apt/lists/*
+
+ENV PATH=/.openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/.openmpi/lib:$LD_LIBRARY_PATH
+
+RUN mv /.openmpi/bin/mpirun /.openmpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /.openmpi/bin/mpirun && \
+    echo "/.openmpi/bin/mpirun.real" '--allow-run-as-root "$@"' >> /.openmpi/bin/mpirun && \
+    chmod a+x /.openmpi/bin/mpirun
+
+# Configure OpenMPI to run good defaults (written to the MCA params file instead of passing flags):
+#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /.openmpi/etc/openmpi-mca-params.conf && \
+    echo "rmaps_base_mapping_policy = slot" >> /.openmpi/etc/openmpi-mca-params.conf && \
+    echo "btl_tcp_if_exclude = lo,docker0" >> /.openmpi/etc/openmpi-mca-params.conf
+
+# configure the path.
+RUN echo export 'PATH=/conda/bin:/.openmpi/bin:$PATH' >> ~/.bashrc
+RUN echo export 'LD_LIBRARY_PATH=/.openmpi/lib:$LD_LIBRARY_PATH' >> ~/.bashrc
+
+RUN conda install -y -c conda-forge mpi4py
+# -------------------- TensorFlow Related --------------------
+RUN conda install -y tensorflow
+
+# -------------------- Others --------------------
+RUN echo "orte_keep_fqdn_hostnames=t" >> /.openmpi/etc/openmpi-mca-params.conf
+
+COPY ./entrypoint.sh /usr/local/bin/
+RUN chmod a+x /usr/local/bin/entrypoint.sh
+
+# Copy your application code to the container (make sure you create a .dockerignore file if any large files or directories should be excluded)
+RUN mkdir /app/
+WORKDIR /app/
+
+EXPOSE 22
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"]
diff --git a/tensorflow/base-cpu/entrypoint.sh b/tensorflow/base-cpu/entrypoint.sh
new file mode 100644
index 0000000..4a71887
--- /dev/null
+++ b/tensorflow/base-cpu/entrypoint.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+# Container entrypoint: prepares sshd host/user keys and config files, then execs the given command.
+PERMIT_ROOT_LOGIN=yes
+MY_NAME=root
+# Generate the host keys only when they are missing, so existing keys are reused on restart.
+[ -f /.sshd/host_keys/host_rsa_key ] || ssh-keygen -f /.sshd/host_keys/host_rsa_key -C '' -N '' -t rsa
+[ -f /.sshd/host_keys/host_dsa_key ] || ssh-keygen -f /.sshd/host_keys/host_dsa_key -C '' -N '' -t dsa
+# Copy the keys found in /ssh-key/<user> into /.sshd/user_keys/<user>, accessible by that user only.
+create_ssh_key() {
+    user=$1
+    mkdir -p "/.sshd/user_keys/$user"
+    chmod 700 "/.sshd/user_keys/$user"
+    chown "$user:$user" "/.sshd/user_keys/$user"
+    if [ -n "$(ls -A "/ssh-key/$user")" ]; then
+        cp "/ssh-key/$user"/* "/.sshd/user_keys/$user/"
+        chmod 600 "/.sshd/user_keys/$user"/*
+        chown "$user:$user" "/.sshd/user_keys/$user"/*
+    fi
+}
+
+create_ssh_key "$MY_NAME"
+
+# generating sshd_config
+cat << EOT > /.sshd/user_keys/$MY_NAME/sshd_config
+# Package generated configuration file
+# See the sshd_config(5) manpage for details
+# What ports, IPs and protocols we listen for
+Port 22
+# Use these options to restrict which interfaces/protocols sshd will bind to
+#ListenAddress ::
+#ListenAddress 0.0.0.0
+Protocol 2
+PidFile /.sshd/user_keys/$MY_NAME/sshd.pid
+# HostKeys for protocol version 2
+HostKey /.sshd/host_keys/host_rsa_key
+HostKey /.sshd/host_keys/host_dsa_key
+# Privilege separation is turned off
+UsePrivilegeSeparation no
+# Lifetime and size of ephemeral version 1 server key
+KeyRegenerationInterval 3600
+ServerKeyBits 768
+# Logging
+SyslogFacility AUTH
+LogLevel INFO
+# Authentication:
+LoginGraceTime 120
+PermitRootLogin $PERMIT_ROOT_LOGIN
+StrictModes yes
+RSAAuthentication yes
+PubkeyAuthentication yes
+AuthorizedKeysFile /.sshd/user_keys/%u/authorized_keys
+# Don't read the user's ~/.rhosts and ~/.shosts files
+IgnoreRhosts yes
+# For this to work you will also need host keys in /etc/ssh_known_hosts
+RhostsRSAAuthentication no
+# similar for protocol version 2
+HostbasedAuthentication no
+# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication
+#IgnoreUserKnownHosts yes
+# To enable empty passwords, change to yes (NOT RECOMMENDED)
+PermitEmptyPasswords no
+# Change to yes to enable challenge-response passwords (beware issues with
+# some PAM modules and threads)
+ChallengeResponseAuthentication no
+X11Forwarding yes
+X11DisplayOffset 10
+PrintMotd no
+PrintLastLog yes
+TCPKeepAlive yes
+#UseLogin no
+# Allow client to pass locale environment variables
+AcceptEnv LANG LC_*
+Subsystem sftp /usr/lib/openssh/sftp-server
+# Set this to 'yes' to enable PAM authentication, account processing,
+# and session processing. If this is enabled, PAM authentication will
+# be allowed through the ChallengeResponseAuthentication and
+# PasswordAuthentication. Depending on your PAM configuration,
+# PAM authentication via ChallengeResponseAuthentication may bypass
+# the setting of "PermitRootLogin without-password".
+# If you just want the PAM account and session checks to run without
+# PAM authentication, then enable this but set PasswordAuthentication
+# and ChallengeResponseAuthentication to 'no'.
+UsePAM no
+# we need this to set various variables (LD_LIBRARY_PATH etc.) for users
+# since sshd wipes all previously set environment variables when opening
+# a new session
+PermitUserEnvironment yes
+EOT
+
+#cat << EOT > /$MY_NAME/.ssh/config
+cat << EOT > /etc/ssh/ssh_config
+StrictHostKeyChecking no
+IdentityFile /.sshd/user_keys/$MY_NAME/id_rsa
+Port 22
+UserKnownHostsFile=/dev/null
+EOT
+
+#prepare run dir
+if [ ! -d "/var/run/sshd" ]; then
+    mkdir -p /var/run/sshd
+fi
+# Replace this shell with the command passed as arguments (the image CMD by default).
+exec "$@"
diff --git a/tensorflow/base-mpi/Dockerfile b/tensorflow/base-mpi/Dockerfile
new file mode 100644
index 0000000..36f39ef
--- /dev/null
+++ b/tensorflow/base-mpi/Dockerfile
@@ -0,0 +1,126 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 AS mlbench-worker-base
+# TODO: reduce size and complexity of image.
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    make \
+    libc-dev \
+    musl-dev \
+    openssh-server \
+    g++ \
+    git \
+    curl \
+    sudo \
+    iproute2 && rm -rf /var/lib/apt/lists/*
+
+# -------------------- SSH (client: disable host key checking; key dirs owned by SSH_USER) --------------------
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ARG SSH_USER=root
+ENV SSH_USER=$SSH_USER
+RUN mkdir -p /ssh-key/$SSH_USER && chown -R $SSH_USER:$SSH_USER /ssh-key/$SSH_USER
+RUN mkdir -p /.sshd/host_keys && \
+    chown -R $SSH_USER:$SSH_USER /.sshd/host_keys && chmod 700 /.sshd/host_keys
+RUN mkdir -p /.sshd/user_keys/$SSH_USER && \
+    chown -R $SSH_USER:$SSH_USER /.sshd/user_keys/$SSH_USER && chmod 700 /.sshd/user_keys/$SSH_USER
+VOLUME /ssh-key/$SSH_USER
+
+# -------------------- Cuda Dependency (NCCL pinned to the CUDA 9.0 build) --------------------
+RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
+RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades \
+    --allow-change-held-packages \
+    libnccl2=2.0.5-3+cuda9.0 \
+    libnccl-dev=2.0.5-3+cuda9.0 && \
+    rm -rf /var/lib/apt/lists/*
+
+# -------------------- Conda environment (Miniconda installed to /conda) --------------------
+RUN curl -fsSL -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && \
+    sh ~/miniconda.sh -b -p /conda && rm ~/miniconda.sh
+ENV PATH=/conda/bin:$PATH
+ENV LD_LIBRARY_PATH=/conda/lib:$LD_LIBRARY_PATH
+
+# TODO: Source code in Channel Anaconda can be outdated, switch to conda-forge if possible.
+RUN conda install -y -c anaconda numpy pyyaml scipy mkl setuptools cmake cffi mkl-include typing \
+    && conda install -y -c mingfeima mkldnn \
+    && conda install -y -c soumith magma-cuda90 \
+    && conda install -y -c conda-forge python-lmdb opencv numpy \
+    && conda clean --all -y
+
+# -------------------- Open MPI (built from source under /tmp with CUDA support, installed to /.openmpi) --------------------
+RUN mkdir /.openmpi/
+RUN apt-get update && apt-get install -y --no-install-recommends wget \
+    && cd /tmp \
+    && wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz \
+    && tar xzf openmpi-3.0.0.tar.gz \
+    && cd openmpi-3.0.0 \
+    && ./configure --prefix=/.openmpi/ --with-cuda \
+    && make all install \
+    && cd / && rm -rf /tmp/openmpi-3.0.0 /tmp/openmpi-3.0.0.tar.gz \
+    && apt-get remove -y wget && rm -rf /var/lib/apt/lists/*
+
+ENV PATH=/.openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/.openmpi/lib:$LD_LIBRARY_PATH
+
+RUN mv /.openmpi/bin/mpirun /.openmpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /.openmpi/bin/mpirun && \
+    echo "/.openmpi/bin/mpirun.real" '--allow-run-as-root "$@"' >> /.openmpi/bin/mpirun && \
+    chmod a+x /.openmpi/bin/mpirun
+
+# Configure OpenMPI to run good defaults (written to the MCA params file instead of passing flags):
+#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /.openmpi/etc/openmpi-mca-params.conf && \
+    echo "rmaps_base_mapping_policy = slot" >> /.openmpi/etc/openmpi-mca-params.conf && \
+    echo "btl_tcp_if_exclude = lo,docker0" >> /.openmpi/etc/openmpi-mca-params.conf
+
+# configure the path.
+RUN echo export 'PATH=/conda/bin:/.openmpi/bin:$PATH' >> ~/.bashrc
+RUN echo export 'LD_LIBRARY_PATH=/.openmpi/lib:$LD_LIBRARY_PATH' >> ~/.bashrc
+
+RUN conda install -y -c conda-forge mpi4py
+
+# -------- Build Tensorflow with MPI support------
+# source install instructions https://www.tensorflow.org/install/source
+# pip six numpy wheel setuptools already installed
+RUN pip install --no-cache-dir mock && \
+    pip install --no-cache-dir keras_applications==1.0.6 --no-deps && \
+    pip install --no-cache-dir keras_preprocessing==1.0.5 --no-deps
+
+# install bazel (build steps already run as root, so no sudo is needed)
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl -fsSL https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && apt-get -y install bazel && \
+    rm -rf /var/lib/apt/lists/*
+
+# clone tensorflow repo (NOTE: default branch, unpinned; the scripted ./configure answers below are positional)
+WORKDIR /tmp
+RUN git clone https://github.com/tensorflow/tensorflow.git
+
+# configure for CUDA 9.0 with mpi
+WORKDIR /tmp/tensorflow
+RUN echo '\n''\n'n'\n''\n'N'\n'y'\n'9'\n''\n''\n''\n'N'\n''\n''\n'N'\n''\n'y'\n''\n''\n'N'\n' | ./configure
+
+# build package with bazel
+RUN bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package && \
+    ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
+
+# install with pip
+RUN TFLOW_PKG_VER=$(ls /tmp/tensorflow_pkg/) && \
+    pip install --no-cache-dir "/tmp/tensorflow_pkg/$TFLOW_PKG_VER"
+
+# -------------------- Others --------------------
+RUN echo "orte_keep_fqdn_hostnames=t" >> /.openmpi/etc/openmpi-mca-params.conf
+
+COPY ./entrypoint.sh /usr/local/bin/
+RUN chmod a+x /usr/local/bin/entrypoint.sh
+
+# Copy your application code to the container (make sure you create a .dockerignore file if any large files or directories should be excluded)
+RUN mkdir /app/
+WORKDIR /app/
+RUN git clone https://github.com/mlbench/mlbench-benchmarks.git
+RUN pip install --no-cache-dir -r /app/mlbench-benchmarks/tensorflow/imagerecognition/openmpi-cifar10-resnet20-all-reduce/requirements.txt
+
+EXPOSE 22
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"]
diff --git a/tensorflow/base-mpi/entrypoint.sh b/tensorflow/base-mpi/entrypoint.sh
new file mode 100644
index 0000000..4a71887
--- /dev/null
+++ b/tensorflow/base-mpi/entrypoint.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+# Container entrypoint: prepares sshd host/user keys and config files, then execs the given command.
+PERMIT_ROOT_LOGIN=yes
+MY_NAME=root
+# Generate the host keys only when they are missing, so existing keys are reused on restart.
+[ -f /.sshd/host_keys/host_rsa_key ] || ssh-keygen -f /.sshd/host_keys/host_rsa_key -C '' -N '' -t rsa
+[ -f /.sshd/host_keys/host_dsa_key ] || ssh-keygen -f /.sshd/host_keys/host_dsa_key -C '' -N '' -t dsa
+# Copy the keys found in /ssh-key/<user> into /.sshd/user_keys/<user>, accessible by that user only.
+create_ssh_key() {
+    user=$1
+    mkdir -p "/.sshd/user_keys/$user"
+    chmod 700 "/.sshd/user_keys/$user"
+    chown "$user:$user" "/.sshd/user_keys/$user"
+    if [ -n "$(ls -A "/ssh-key/$user")" ]; then
+        cp "/ssh-key/$user"/* "/.sshd/user_keys/$user/"
+        chmod 600 "/.sshd/user_keys/$user"/*
+        chown "$user:$user" "/.sshd/user_keys/$user"/*
+    fi
+}
+
+create_ssh_key "$MY_NAME"
+
+# generating sshd_config
+cat << EOT > /.sshd/user_keys/$MY_NAME/sshd_config
+# Package generated configuration file
+# See the sshd_config(5) manpage for details
+# What ports, IPs and protocols we listen for
+Port 22
+# Use these options to restrict which interfaces/protocols sshd will bind to
+#ListenAddress ::
+#ListenAddress 0.0.0.0
+Protocol 2
+PidFile /.sshd/user_keys/$MY_NAME/sshd.pid
+# HostKeys for protocol version 2
+HostKey /.sshd/host_keys/host_rsa_key
+HostKey /.sshd/host_keys/host_dsa_key
+# Privilege separation is turned off
+UsePrivilegeSeparation no
+# Lifetime and size of ephemeral version 1 server key
+KeyRegenerationInterval 3600
+ServerKeyBits 768
+# Logging
+SyslogFacility AUTH
+LogLevel INFO
+# Authentication:
+LoginGraceTime 120
+PermitRootLogin $PERMIT_ROOT_LOGIN
+StrictModes yes
+RSAAuthentication yes
+PubkeyAuthentication yes
+AuthorizedKeysFile /.sshd/user_keys/%u/authorized_keys
+# Don't read the user's ~/.rhosts and ~/.shosts files
+IgnoreRhosts yes
+# For this to work you will also need host keys in /etc/ssh_known_hosts
+RhostsRSAAuthentication no
+# similar for protocol version 2
+HostbasedAuthentication no
+# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication
+#IgnoreUserKnownHosts yes
+# To enable empty passwords, change to yes (NOT RECOMMENDED)
+PermitEmptyPasswords no
+# Change to yes to enable challenge-response passwords (beware issues with
+# some PAM modules and threads)
+ChallengeResponseAuthentication no
+X11Forwarding yes
+X11DisplayOffset 10
+PrintMotd no
+PrintLastLog yes
+TCPKeepAlive yes
+#UseLogin no
+# Allow client to pass locale environment variables
+AcceptEnv LANG LC_*
+Subsystem sftp /usr/lib/openssh/sftp-server
+# Set this to 'yes' to enable PAM authentication, account processing,
+# and session processing. If this is enabled, PAM authentication will
+# be allowed through the ChallengeResponseAuthentication and
+# PasswordAuthentication. Depending on your PAM configuration,
+# PAM authentication via ChallengeResponseAuthentication may bypass
+# the setting of "PermitRootLogin without-password".
+# If you just want the PAM account and session checks to run without
+# PAM authentication, then enable this but set PasswordAuthentication
+# and ChallengeResponseAuthentication to 'no'.
+UsePAM no
+# we need this to set various variables (LD_LIBRARY_PATH etc.) for users
+# since sshd wipes all previously set environment variables when opening
+# a new session
+PermitUserEnvironment yes
+EOT
+
+#cat << EOT > /$MY_NAME/.ssh/config
+cat << EOT > /etc/ssh/ssh_config
+StrictHostKeyChecking no
+IdentityFile /.sshd/user_keys/$MY_NAME/id_rsa
+Port 22
+UserKnownHostsFile=/dev/null
+EOT
+
+#prepare run dir
+if [ ! -d "/var/run/sshd" ]; then
+    mkdir -p /var/run/sshd
+fi
+# Replace this shell with the command passed as arguments (the image CMD by default).
+exec "$@"
diff --git a/tensorflow/base/Dockerfile b/tensorflow/base/Dockerfile
index 303f686..4355b90 100644
--- a/tensorflow/base/Dockerfile
+++ b/tensorflow/base/Dockerfile
@@ -94,4 +94,4 @@ WORKDIR /app/
 
 EXPOSE 22
 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
-CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"]
\ No newline at end of file
+CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"]