From bc51db38988f77cb06308e18d5a00f46768e7993 Mon Sep 17 00:00:00 2001 From: Ali Sabet Sarvestani Date: Mon, 18 Mar 2019 14:30:07 +0100 Subject: [PATCH 1/5] Added tensorflow-cpu docker image --- tensorflow/base-cpu/Dockerfile | 89 ++++++++++++++++++++++++++ tensorflow/base-cpu/entrypoint.sh | 103 ++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 tensorflow/base-cpu/Dockerfile create mode 100644 tensorflow/base-cpu/entrypoint.sh diff --git a/tensorflow/base-cpu/Dockerfile b/tensorflow/base-cpu/Dockerfile new file mode 100644 index 0000000..af1fc70 --- /dev/null +++ b/tensorflow/base-cpu/Dockerfile @@ -0,0 +1,89 @@ +FROM tensorflow/tensorflow:latest-py3 as mlbench-worker-base-cpu +# TODO: reduce size and complexity of image. + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + make \ + libc-dev \ + musl-dev \ + openssh-server \ + g++ \ + git \ + curl \ + sudo \ + iproute2 + +# -------------------- SSH -------------------- +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ARG SSH_USER=root +ENV SSH_USER=$SSH_USER +RUN mkdir -p /ssh-key/$SSH_USER && chown -R $SSH_USER:$SSH_USER /ssh-key/$SSH_USER +RUN mkdir -p /.sshd/host_keys && \ + chown -R $SSH_USER:$SSH_USER /.sshd/host_keys && chmod 700 /.sshd/host_keys +RUN mkdir -p /.sshd/user_keys/$SSH_USER && \ + chown -R $SSH_USER:$SSH_USER /.sshd/user_keys/$SSH_USER && chmod 700 /.sshd/user_keys/$SSH_USER +VOLUME /ssh-key/$SSH_USER + +# -------------------- Conda environment -------------------- +RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && \ + sh ~/miniconda.sh -b -p /conda && rm ~/miniconda.sh +ENV PATH /conda/bin:$PATH +ENV LD_LIBRARY_PATH /conda/lib:$LD_LIBRARY_PATH + +# TODO: Source code in Channel Anaconda can be outdated, switch to conda-forge if posible. +RUN conda install -y -c anaconda numpy pyyaml scipy mkl setuptools cmake cffi mkl-include typing \ + && conda install -y -c mingfeima mkldnn \ + && conda install -y -c soumith magma-cuda90 \ + && conda install -y -c conda-forge python-lmdb opencv numpy \ + && conda clean --all -y + +# -------------------- Open MPI -------------------- +RUN mkdir /.openmpi/ +RUN apt-get update && apt-get install -y --no-install-recommends wget \ + && wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz\ + && gunzip -c openmpi-3.0.0.tar.gz | tar xf - \ + && cd openmpi-3.0.0 \ + && ./configure --prefix=/.openmpi/ \ + && make all install \ + && rm /openmpi-3.0.0.tar.gz \ + && rm -rf /openmpi-3.0.0 \ + && apt-get remove -y wget + +ENV PATH /.openmpi/bin:$PATH +ENV LD_LIBRARY_PATH /.openmpi/lib:$LD_LIBRARY_PATH + +RUN mv /.openmpi/bin/mpirun /.openmpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /.openmpi/bin/mpirun && \ + echo "/.openmpi/bin/mpirun.real" '--allow-run-as-root "$@"' >> /.openmpi/bin/mpirun && \ + chmod a+x /.openmpi/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /.openmpi/etc/openmpi-mca-params.conf && \ + echo "rmaps_base_mapping_policy = slot" >> /.openmpi/etc/openmpi-mca-params.conf && \ + echo "btl_tcp_if_exclude = lo,docker0" >> /.openmpi/etc/openmpi-mca-params.conf + +# configure the path. +RUN echo export 'PATH=$HOME/conda/envs/pytorch-py$PYTHON_VERSION/bin:$HOME/.openmpi/bin:$PATH' >> ~/.bashrc +RUN echo export 'LD_LIBRARY_PATH=$HOME/.openmpi/lib:$LD_LIBRARY_PATH' >> ~/.bashrc + +RUN conda install -y -c conda-forge mpi4py +# -------------------- TensorFlow Related -------------------- +RUN conda install tensorflow + +# -------------------- Others -------------------- +RUN echo "orte_keep_fqdn_hostnames=t" >> /.openmpi/etc/openmpi-mca-params.conf + +ADD ./entrypoint.sh /usr/local/bin/ +RUN chmod a+x /usr/local/bin/entrypoint.sh + +# Copy your application code to the container (make sure you create a .dockerignore file if any large files or directories should be excluded) +RUN mkdir /app/ +WORKDIR /app/ + +EXPOSE 22 +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"] diff --git a/tensorflow/base-cpu/entrypoint.sh b/tensorflow/base-cpu/entrypoint.sh new file mode 100644 index 0000000..4a71887 --- /dev/null +++ b/tensorflow/base-cpu/entrypoint.sh @@ -0,0 +1,103 @@ +#!/bin/sh + +PERMIT_ROOT_LOGIN=yes +MY_NAME=root + +ssh-keygen -f /.sshd/host_keys/host_rsa_key -C '' -N '' -t rsa +ssh-keygen -f /.sshd/host_keys/host_dsa_key -C '' -N '' -t dsa + +create_ssh_key() { + user=$1 + mkdir -p /.sshd/user_keys/$user + chmod 700 /.sshd/user_keys/$user + chown $user:$user /.sshd/user_keys/$user + if ! [ -z "$(ls -A /ssh-key/root)" ]; then + cp /ssh-key/root/* /.sshd/user_keys/$user/ + chmod 600 /.sshd/user_keys/$user/* + chown $user:$user /.sshd/user_keys/$user/* + fi +} + +create_ssh_key $MY_NAME + +# generating sshd_config +cat << EOT > /.sshd/user_keys/$MY_NAME/sshd_config +# Package generated configuration file +# See the sshd_config(5) manpage for details +# What ports, IPs and protocols we listen for +Port 22 +# Use these options to restrict which interfaces/protocols sshd will bind to +#ListenAddress :: +#ListenAddress 0.0.0.0 +Protocol 2 +PidFile /.sshd/user_keys/$MY_NAME/sshd.pid +# HostKeys for protocol version 2 +HostKey /.sshd/host_keys/host_rsa_key +HostKey /.sshd/host_keys/host_dsa_key +#Privilege Separation is turned on for security +UsePrivilegeSeparation no +# Lifetime and size of ephemeral version 1 server key +KeyRegenerationInterval 3600 +ServerKeyBits 768 +# Logging +SyslogFacility AUTH +LogLevel INFO +# Authentication: +LoginGraceTime 120 +PermitRootLogin $PERMIT_ROOT_LOGIN +StrictModes yes +RSAAuthentication yes +PubkeyAuthentication yes +AuthorizedKeysFile /.sshd/user_keys/%u/authorized_keys +# Don't read the user's ~/.rhosts and ~/.shosts files +IgnoreRhosts yes +# For this to work you will also need host keys in /etc/ssh_known_hosts +RhostsRSAAuthentication no +# similar for protocol version 2 +HostbasedAuthentication no +# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication +#IgnoreUserKnownHosts yes +# To enable empty passwords, change to yes (NOT RECOMMENDED) +PermitEmptyPasswords no +# Change to yes to enable challenge-response passwords (beware issues with +# some PAM modules and threads) +ChallengeResponseAuthentication no +X11Forwarding yes +X11DisplayOffset 10 +PrintMotd no +PrintLastLog yes +TCPKeepAlive yes +#UseLogin no +# Allow client to pass locale environment variables +AcceptEnv LANG LC_* +Subsystem sftp /usr/lib/openssh/sftp-server +# Set this to 'yes' to enable PAM authentication, account processing, +# and session processing. If this is enabled, PAM authentication will +# be allowed through the ChallengeResponseAuthentication and +# PasswordAuthentication. Depending on your PAM configuration, +# PAM authentication via ChallengeResponseAuthentication may bypass +# the setting of "PermitRootLogin without-password". +# If you just want the PAM account and session checks to run without +# PAM authentication, then enable this but set PasswordAuthentication +# and ChallengeResponseAuthentication to 'no'. +UsePAM no +# we need this to set various variables (LD_LIBRARY_PATH etc.) for users +# since sshd wipes all previously set environment variables when opening +# a new session +PermitUserEnvironment yes +EOT + +#cat << EOT > /$MY_NAME/.ssh/config +cat << EOT > /etc/ssh/ssh_config +StrictHostKeyChecking no +IdentityFile /.sshd/user_keys/$MY_NAME/id_rsa +Port 22 +UserKnownHostsFile=/dev/null +EOT + +#prepare run dir +if [ ! -d "/var/run/sshd" ]; then + mkdir -p /var/run/sshd +fi +# EOT +exec "$@" \ No newline at end of file From fc81fcb1429794ad536f74b4af70c5016f4eb1fe Mon Sep 17 00:00:00 2001 From: Ali Sabet Sarvestani Date: Tue, 19 Mar 2019 20:26:43 +0100 Subject: [PATCH 2/5] Created Docker base image for running tensorflow with mpi support --- tensorflow/base-mpi/Dockerfile | 125 ++++++++++++++++++++++++++++++ tensorflow/base-mpi/entrypoint.sh | 103 ++++++++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 tensorflow/base-mpi/Dockerfile create mode 100644 tensorflow/base-mpi/entrypoint.sh diff --git a/tensorflow/base-mpi/Dockerfile b/tensorflow/base-mpi/Dockerfile new file mode 100644 index 0000000..a2d48d7 --- /dev/null +++ b/tensorflow/base-mpi/Dockerfile @@ -0,0 +1,125 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 as mlbench-worker-base +# TODO: reduce size and complexity of image. + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + make \ + libc-dev \ + musl-dev \ + openssh-server \ + g++ \ + git \ + curl \ + sudo \ + iproute2 + +# -------------------- SSH -------------------- +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ARG SSH_USER=root +ENV SSH_USER=$SSH_USER +RUN mkdir -p /ssh-key/$SSH_USER && chown -R $SSH_USER:$SSH_USER /ssh-key/$SSH_USER +RUN mkdir -p /.sshd/host_keys && \ + chown -R $SSH_USER:$SSH_USER /.sshd/host_keys && chmod 700 /.sshd/host_keys +RUN mkdir -p /.sshd/user_keys/$SSH_USER && \ + chown -R $SSH_USER:$SSH_USER /.sshd/user_keys/$SSH_USER && chmod 700 /.sshd/user_keys/$SSH_USER +VOLUME /ssh-key/$SSH_USER + +# -----–––---------------------- Cuda Dependency -------------------- +RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list +RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades \ + --allow-change-held-packages \ + libnccl2=2.0.5-3+cuda9.0 \ + libnccl-dev=2.0.5-3+cuda9.0 &&\ + rm -rf /var/lib/apt/lists/* + +# -------------------- Conda environment -------------------- +RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && \ + sh ~/miniconda.sh -b -p /conda && rm ~/miniconda.sh +ENV PATH /conda/bin:$PATH +ENV LD_LIBRARY_PATH /conda/lib:$LD_LIBRARY_PATH + +# TODO: Source code in Channel Anaconda can be outdated, switch to conda-forge if posible. +RUN conda install -y -c anaconda numpy pyyaml scipy mkl setuptools cmake cffi mkl-include typing \ + && conda install -y -c mingfeima mkldnn \ + && conda install -y -c soumith magma-cuda90 \ + && conda install -y -c conda-forge python-lmdb opencv numpy \ + && conda clean --all -y + +# -------------------- Open MPI -------------------- +RUN mkdir /.openmpi/ +RUN apt-get update && apt-get install -y --no-install-recommends wget \ + && wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz\ + && gunzip -c openmpi-3.0.0.tar.gz | tar xf - \ + && cd openmpi-3.0.0 \ + && ./configure --prefix=/.openmpi/ --with-cuda\ + && make all install \ + && rm /openmpi-3.0.0.tar.gz \ + && rm -rf /openmpi-3.0.0 \ + && apt-get remove -y wget + +ENV PATH /.openmpi/bin:$PATH +ENV LD_LIBRARY_PATH /.openmpi/lib:$LD_LIBRARY_PATH + +RUN mv /.openmpi/bin/mpirun /.openmpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /.openmpi/bin/mpirun && \ + echo "/.openmpi/bin/mpirun.real" '--allow-run-as-root "$@"' >> /.openmpi/bin/mpirun && \ + chmod a+x /.openmpi/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /.openmpi/etc/openmpi-mca-params.conf && \ + echo "rmaps_base_mapping_policy = slot" >> /.openmpi/etc/openmpi-mca-params.conf && \ + echo "btl_tcp_if_exclude = lo,docker0" >> /.openmpi/etc/openmpi-mca-params.conf + +# configure the path. +RUN echo export 'PATH=$HOME/conda/envs/pytorch-py$PYTHON_VERSION/bin:$HOME/.openmpi/bin:$PATH' >> ~/.bashrc +RUN echo export 'LD_LIBRARY_PATH=$HOME/.openmpi/lib:$LD_LIBRARY_PATH' >> ~/.bashrc + +RUN conda install -y -c conda-forge mpi4py + +# -------- Build Tensorflow with MPI support------ +# source install instructions https://www.tensorflow.org/install/source +# pip six numpy wheel setuptools already installed +RUN pip install mock +RUN pip install keras_applications==1.0.6 --no-deps +RUN pip install keras_preprocessing==1.0.5 --no-deps + +# install bazel +RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list +RUN curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add - +RUN apt-get update && apt-get -y install bazel +RUN apt-get install --only-upgrade bazel + +# clone tensorflow repo +RUN git clone https://github.com/tensorflow/tensorflow.git +RUN cd tensorflow +RUN git fetch origin +refs/pull/26720/merge +RUN git checkout FETCH_HEAD + +# configure for CUDA 9.0 with mpi +RUN echo -ne '\n''\n'n'\n''\n'N'\n'y'\n'9'\n''\n''\n''\n'N'\n''\n''\n'N'\n''\n'y'\n''\n''\n'N'\n' | ./configure + +# build package with bazel +RUN bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package +RUN ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg + +# install with pip +RUN TFLOW_PKG_VER=$(ls /tmp/tensorflow_pkg/) +RUN pip install /tmp/tensorflow_pkg/$TFLOW_PKG_VER + +# -------------------- Others -------------------- +RUN echo "orte_keep_fqdn_hostnames=t" >> /.openmpi/etc/openmpi-mca-params.conf + +ADD ./entrypoint.sh /usr/local/bin/ +RUN chmod a+x /usr/local/bin/entrypoint.sh + +# Copy your application code to the container (make sure you create a .dockerignore file if any large files or directories should be excluded) +RUN mkdir /app/ +WORKDIR /app/ + +EXPOSE 22 +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"] diff --git a/tensorflow/base-mpi/entrypoint.sh b/tensorflow/base-mpi/entrypoint.sh new file mode 100644 index 0000000..4a71887 --- /dev/null +++ b/tensorflow/base-mpi/entrypoint.sh @@ -0,0 +1,103 @@ +#!/bin/sh + +PERMIT_ROOT_LOGIN=yes +MY_NAME=root + +ssh-keygen -f /.sshd/host_keys/host_rsa_key -C '' -N '' -t rsa +ssh-keygen -f /.sshd/host_keys/host_dsa_key -C '' -N '' -t dsa + +create_ssh_key() { + user=$1 + mkdir -p /.sshd/user_keys/$user + chmod 700 /.sshd/user_keys/$user + chown $user:$user /.sshd/user_keys/$user + if ! [ -z "$(ls -A /ssh-key/root)" ]; then + cp /ssh-key/root/* /.sshd/user_keys/$user/ + chmod 600 /.sshd/user_keys/$user/* + chown $user:$user /.sshd/user_keys/$user/* + fi +} + +create_ssh_key $MY_NAME + +# generating sshd_config +cat << EOT > /.sshd/user_keys/$MY_NAME/sshd_config +# Package generated configuration file +# See the sshd_config(5) manpage for details +# What ports, IPs and protocols we listen for +Port 22 +# Use these options to restrict which interfaces/protocols sshd will bind to +#ListenAddress :: +#ListenAddress 0.0.0.0 +Protocol 2 +PidFile /.sshd/user_keys/$MY_NAME/sshd.pid +# HostKeys for protocol version 2 +HostKey /.sshd/host_keys/host_rsa_key +HostKey /.sshd/host_keys/host_dsa_key +#Privilege Separation is turned on for security +UsePrivilegeSeparation no +# Lifetime and size of ephemeral version 1 server key +KeyRegenerationInterval 3600 +ServerKeyBits 768 +# Logging +SyslogFacility AUTH +LogLevel INFO +# Authentication: +LoginGraceTime 120 +PermitRootLogin $PERMIT_ROOT_LOGIN +StrictModes yes +RSAAuthentication yes +PubkeyAuthentication yes +AuthorizedKeysFile /.sshd/user_keys/%u/authorized_keys +# Don't read the user's ~/.rhosts and ~/.shosts files +IgnoreRhosts yes +# For this to work you will also need host keys in /etc/ssh_known_hosts +RhostsRSAAuthentication no +# similar for protocol version 2 +HostbasedAuthentication no +# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication +#IgnoreUserKnownHosts yes +# To enable empty passwords, change to yes (NOT RECOMMENDED) +PermitEmptyPasswords no +# Change to yes to enable challenge-response passwords (beware issues with +# some PAM modules and threads) +ChallengeResponseAuthentication no +X11Forwarding yes +X11DisplayOffset 10 +PrintMotd no +PrintLastLog yes +TCPKeepAlive yes +#UseLogin no +# Allow client to pass locale environment variables +AcceptEnv LANG LC_* +Subsystem sftp /usr/lib/openssh/sftp-server +# Set this to 'yes' to enable PAM authentication, account processing, +# and session processing. If this is enabled, PAM authentication will +# be allowed through the ChallengeResponseAuthentication and +# PasswordAuthentication. Depending on your PAM configuration, +# PAM authentication via ChallengeResponseAuthentication may bypass +# the setting of "PermitRootLogin without-password". +# If you just want the PAM account and session checks to run without +# PAM authentication, then enable this but set PasswordAuthentication +# and ChallengeResponseAuthentication to 'no'. +UsePAM no +# we need this to set various variables (LD_LIBRARY_PATH etc.) for users +# since sshd wipes all previously set environment variables when opening +# a new session +PermitUserEnvironment yes +EOT + +#cat << EOT > /$MY_NAME/.ssh/config +cat << EOT > /etc/ssh/ssh_config +StrictHostKeyChecking no +IdentityFile /.sshd/user_keys/$MY_NAME/id_rsa +Port 22 +UserKnownHostsFile=/dev/null +EOT + +#prepare run dir +if [ ! -d "/var/run/sshd" ]; then + mkdir -p /var/run/sshd +fi +# EOT +exec "$@" \ No newline at end of file From f7b0aff9ddd6198a453c4408de7e95c89e7e46ee Mon Sep 17 00:00:00 2001 From: Ali Sabet Sarvestani Date: Thu, 28 Mar 2019 16:46:25 +0100 Subject: [PATCH 3/5] Fixed Dockerfile git fetch error for tf-mpi fork repo --- tensorflow/base-mpi/.Dockerfile.swp | Bin 0 -> 16384 bytes tensorflow/base-mpi/Dockerfile | 36 +++++++++++++++------------- tensorflow/base/Dockerfile | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) create mode 100644 tensorflow/base-mpi/.Dockerfile.swp diff --git a/tensorflow/base-mpi/.Dockerfile.swp b/tensorflow/base-mpi/.Dockerfile.swp new file mode 100644 index 0000000000000000000000000000000000000000..33bb4d9260209dc49f3540dae2a4ae7702bc2697 GIT binary patch literal 16384 zcmeHOO>87b6>dTT;U|CqDH7tRGPc%PcDvi_^?G+LImEHOYbWa+%l2-FveBrfyJn{2 z?ygQ(x5u-J79?PAa3DZX1Q8q&NCD*p;TBGyAR(lX3j)Fcf)F=E#EI`!bx)7&@v;e* z2=yeNySuu6UVZiIRlQfe?dhv)>+GB}ZQ=J3%es31*ROTX-msoovn<8iBBS*G@(VmK z%vC1RQ>S$`M`gdED|Dl%S3Cca)onSjn^#%D&s?O4v~nxXaSp5osp~#ZmRu z=`{m21GmXQo^|YV_gpzWb4t(AiN<~GV;{Y8n}zFaH3KyRH3KyRH3KyRH3KyRH3KyR z{|^jg$sN`g(2+Ze&TJRgcOAICyLdicTuaZP%vU7Wmzsf^ftrDuftrDuftrDuftrDu zftrDuftrDufp;JS+_$VVkn8*8$m9IK^#5P_uw{J>_$u%)@E~voI0oDc+yT7#AZUSEc9tQ3Oes-s2Jqvsb2!Sc!6$C^S@EPFKz}>)i5JXu6 zR)J3ep9J0juf*;5-fo~|f+?A&o?t<&G9HHdn&w^>$XKynmL+Q5b^HCkQ+-6?$h7Mg z`CU`onqYSEV-Mv)=(FwNHy5lQWIeXNvBp%MBr?rRM$6i~a+!HD_W7`Jd%$eZq}d&` zA{ai^m`w&Iukd=3i9M=cZq|bA82iWFlhvydw%8`1sSgwe7m75GQnc{nrWH7n$*!F7ka_j2K=GI#CvfB>g z`2)@BB&z2?TNBi&PSDL$!LpuU3D0_tWikA;2#t0YwlXhi1)Y}I!}l(}U<)jiCw={y z%5;N7uX-mk&n-- zcC8|;(d&oOGc}{f{=S-H8H=}C^8|RXZK}%7B%>Pe2$}8nKubzb9%o}nswjdAE)6+u zLJdgcP+OYEtShC@e9_@~n5lWgyVr}5$WQEz@CpfBXb*~v&{)x4y>?Q;!v>qOZOj?j zw^738%1))sSYz_Rh93TL1r3wvt{L~KiTg;6=(D6A%8V7IB0t(4pJ7}>T0&FQTDg3c z9heIv6^)xJx(MmmWoL3V#o%pk4H@%-lh?=(+(Rd>S<@% znT8rs=gsT=ubhmhPWO71BjP)Pl{Sg4!?KQBO()wC6+KXM#8?Q%d7Z_R)@|T|C>V@MIK;*jJi)$VWCc+VO*QWZ*`Ob7`dM z%4m{kyF8j&-VX;EOv6HZq2MtZr(v)M7ukE{pRh8B|1$ zs{u<&L`z1*yAjuecz#50#+8U*fjT0HIQky?aa<5FE-R0@Nj8Ecg8`31rn~@Bp)gY- z%acsXP%%#{px5aH^ai9GyPOTkl%k0Vw%uHA&a*AEf_f50JBZm*kH@hH*<#s}2j<#> z$vmTR^-nSd?euzNoU|y#QWtbsB2~~11y+W!QAyR?nWSCgrltTf%2>S5snOM!mt z;L&a~J7F}BFuG64PliI%6U6_G6I%A(cK}O;{4AZe=##D@EoH>-M%2S$t55wIRc29&p)?sgl zcZDkSubjSbYeJXxljZ29+#l9zHVnRK>mk7I480K{3{dKnR>lEme!wv-5k@#l@=Ul9 z_j*ArY)s@dh61}ZaLUcwd7R~^=B8&|Gc3%->djo*Vi1RBf2SPc((zqWu^ojvp|mWo zfAJaf|KZ`G6t)D8*|LDW@$qdg8L`7Zd>q`obY*>oxk~ly9Wihxx3*SWSGHC*F@V_m z^Q&W)y(7BxHj0+tP2fh_U;k& z7dkvfVqNtSHV1t_LOZD^L6X9Xiy4a^RPIa2NVguDNpsr`vhwc4iQg9)D!7jQRqSIGijq+51=)bf z4#P6q#S(O3v$@X9ba%-{&cjI3VjRb3Nlp>krRi{{XBk(l{P4anQ+!!(liy*H|Nley z%scR-$^So&3;kY!fBzzI9asUDfmz@M`1NhT0qz3s1pW?x{zt$!0Rdb9R)7iM$MEZ4 z0KNg70bYlH|0m$r!1KTk@FZ{+_$&PTzW^@--v*uq?grk3Y<~mZ0R8~H2#~zr1?peT zK+QnSK+QnSK+QnSz&nqDrS;;!rS)a@cmqyQ>M9B;V#Db-PBv_@+&IY^c8C5Ra%^aa zuX7eV`oGE z+hlMW6L-X?G0zm6MSx|1NC|ea34+j^P8hKBM9c(n5Q>RY41vQ#P1*fA@S-SKjuOm6 zuodwz&fwqDKiUksvBWF-x0?(>6CMv($my_vQWTRZL;Op42-^pER6UvFJOTljBCzfs zFOD9n05FkuL@%wyk~<<^OPUA$dQYAB-FxN2)CDR z^#LcDH9=FVZsDIK2xFGfI5&%uk$?HxN_M*L% z$>Z=bAy!$rKGH#J#yfNnzrGHKfYh!!CUSKY%|)(O5f^dWbTrf&Ng3Cc_Na7)Oy@^` zjk~!Z6AA$p59E_D_m=I=A8t?g!c=5kNLlkzeVN?+gc&U0B&(T5YYa zEH2ZqkOxYyJ8-3YQ2(HdDC7wQ#c_C*hoL(&cXnpljRZ{*_`W(nqT$s@QL6DUbt@sA zzm!LRNa^Km9Z9MRjIi!dk;0?)bjd0k4h9lw#4lafQh{t+#^J!u6CK)OhJNZunelMU z48iZRw9yXTDwWahcPX%b=jb6W)G`%_c`4x}=tno{VhFVBPAwr?#JKMxv_GVv3a>rR zaqhRkBY$r8i89uvU{%OH#)(oFwI%_bxhCSxBdgm2^bdu literal 0 HcmV?d00001 diff --git a/tensorflow/base-mpi/Dockerfile b/tensorflow/base-mpi/Dockerfile index a2d48d7..0ff3af4 100644 --- a/tensorflow/base-mpi/Dockerfile +++ b/tensorflow/base-mpi/Dockerfile @@ -83,32 +83,34 @@ RUN conda install -y -c conda-forge mpi4py # -------- Build Tensorflow with MPI support------ # source install instructions https://www.tensorflow.org/install/source # pip six numpy wheel setuptools already installed -RUN pip install mock -RUN pip install keras_applications==1.0.6 --no-deps -RUN pip install keras_preprocessing==1.0.5 --no-deps +RUN pip install mock && \ + pip install keras_applications==1.0.6 --no-deps && \ + pip install keras_preprocessing==1.0.5 --no-deps # install bazel -RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list -RUN curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add - -RUN apt-get update && apt-get -y install bazel -RUN apt-get install --only-upgrade bazel +RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list && \ + curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add - && \ + apt-get update && apt-get -y install bazel && \ + apt-get install --only-upgrade bazel # clone tensorflow repo -RUN git clone https://github.com/tensorflow/tensorflow.git -RUN cd tensorflow -RUN git fetch origin +refs/pull/26720/merge -RUN git checkout FETCH_HEAD +WORKDIR /tmp +RUN git clone https://github.com/tensorflow/tensorflow.git && \ + cd tensorflow && \ + git fetch origin +refs/pull/26720/merge && \ + git checkout FETCH_HEAD # configure for CUDA 9.0 with mpi -RUN echo -ne '\n''\n'n'\n''\n'N'\n'y'\n'9'\n''\n''\n''\n'N'\n''\n''\n'N'\n''\n'y'\n''\n''\n'N'\n' | ./configure +WORKDIR /tmp/tensorflow +RUN echo '\n''\n'n'\n''\n'N'\n'y'\n'9'\n''\n''\n''\n'N'\n''\n''\n'N'\n''\n'y'\n''\n''\n'N'\n' | ./configure # build package with bazel -RUN bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package -RUN ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg +RUN bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package && \ + ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg # install with pip -RUN TFLOW_PKG_VER=$(ls /tmp/tensorflow_pkg/) -RUN pip install /tmp/tensorflow_pkg/$TFLOW_PKG_VER +RUN TFLOW_PKG_VER=$(ls /tmp/tensorflow_pkg/) && \ + pip install /tmp/tensorflow_pkg/$TFLOW_PKG_VER # -------------------- Others -------------------- RUN echo "orte_keep_fqdn_hostnames=t" >> /.openmpi/etc/openmpi-mca-params.conf @@ -119,6 +121,8 @@ RUN chmod a+x /usr/local/bin/entrypoint.sh # Copy your application code to the container (make sure you create a .dockerignore file if any large files or directories should be excluded) RUN mkdir /app/ WORKDIR /app/ +RUN git clone https://github.com/mlbench/mlbench-benchmarks.git +RUN pip install -r /app/mlbench-benchmarks/tensorflow/imagerecognition/openmpi-cifar10-resnet20-all-reduce/requirements.txt EXPOSE 22 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/tensorflow/base/Dockerfile b/tensorflow/base/Dockerfile index 303f686..4355b90 100644 --- a/tensorflow/base/Dockerfile +++ b/tensorflow/base/Dockerfile @@ -94,4 +94,4 @@ WORKDIR /app/ EXPOSE 22 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] -CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"] \ No newline at end of file +CMD ["/usr/sbin/sshd","-eD", "-f", "/.sshd/user_keys/root/sshd_config"] From f4c149d30d84aa87468e6b0536730b5e24f3b41f Mon Sep 17 00:00:00 2001 From: Ali Sabet Date: Thu, 28 Mar 2019 16:48:00 +0100 Subject: [PATCH 4/5] Delete .Dockerfile.swp --- tensorflow/base-mpi/.Dockerfile.swp | Bin 16384 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tensorflow/base-mpi/.Dockerfile.swp diff --git a/tensorflow/base-mpi/.Dockerfile.swp b/tensorflow/base-mpi/.Dockerfile.swp deleted file mode 100644 index 33bb4d9260209dc49f3540dae2a4ae7702bc2697..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHOO>87b6>dTT;U|CqDH7tRGPc%PcDvi_^?G+LImEHOYbWa+%l2-FveBrfyJn{2 z?ygQ(x5u-J79?PAa3DZX1Q8q&NCD*p;TBGyAR(lX3j)Fcf)F=E#EI`!bx)7&@v;e* z2=yeNySuu6UVZiIRlQfe?dhv)>+GB}ZQ=J3%es31*ROTX-msoovn<8iBBS*G@(VmK z%vC1RQ>S$`M`gdED|Dl%S3Cca)onSjn^#%D&s?O4v~nxXaSp5osp~#ZmRu z=`{m21GmXQo^|YV_gpzWb4t(AiN<~GV;{Y8n}zFaH3KyRH3KyRH3KyRH3KyRH3KyR z{|^jg$sN`g(2+Ze&TJRgcOAICyLdicTuaZP%vU7Wmzsf^ftrDuftrDuftrDuftrDu zftrDuftrDufp;JS+_$VVkn8*8$m9IK^#5P_uw{J>_$u%)@E~voI0oDc+yT7#AZUSEc9tQ3Oes-s2Jqvsb2!Sc!6$C^S@EPFKz}>)i5JXu6 zR)J3ep9J0juf*;5-fo~|f+?A&o?t<&G9HHdn&w^>$XKynmL+Q5b^HCkQ+-6?$h7Mg z`CU`onqYSEV-Mv)=(FwNHy5lQWIeXNvBp%MBr?rRM$6i~a+!HD_W7`Jd%$eZq}d&` zA{ai^m`w&Iukd=3i9M=cZq|bA82iWFlhvydw%8`1sSgwe7m75GQnc{nrWH7n$*!F7ka_j2K=GI#CvfB>g z`2)@BB&z2?TNBi&PSDL$!LpuU3D0_tWikA;2#t0YwlXhi1)Y}I!}l(}U<)jiCw={y z%5;N7uX-mk&n-- zcC8|;(d&oOGc}{f{=S-H8H=}C^8|RXZK}%7B%>Pe2$}8nKubzb9%o}nswjdAE)6+u zLJdgcP+OYEtShC@e9_@~n5lWgyVr}5$WQEz@CpfBXb*~v&{)x4y>?Q;!v>qOZOj?j zw^738%1))sSYz_Rh93TL1r3wvt{L~KiTg;6=(D6A%8V7IB0t(4pJ7}>T0&FQTDg3c z9heIv6^)xJx(MmmWoL3V#o%pk4H@%-lh?=(+(Rd>S<@% znT8rs=gsT=ubhmhPWO71BjP)Pl{Sg4!?KQBO()wC6+KXM#8?Q%d7Z_R)@|T|C>V@MIK;*jJi)$VWCc+VO*QWZ*`Ob7`dM z%4m{kyF8j&-VX;EOv6HZq2MtZr(v)M7ukE{pRh8B|1$ zs{u<&L`z1*yAjuecz#50#+8U*fjT0HIQky?aa<5FE-R0@Nj8Ecg8`31rn~@Bp)gY- z%acsXP%%#{px5aH^ai9GyPOTkl%k0Vw%uHA&a*AEf_f50JBZm*kH@hH*<#s}2j<#> z$vmTR^-nSd?euzNoU|y#QWtbsB2~~11y+W!QAyR?nWSCgrltTf%2>S5snOM!mt z;L&a~J7F}BFuG64PliI%6U6_G6I%A(cK}O;{4AZe=##D@EoH>-M%2S$t55wIRc29&p)?sgl zcZDkSubjSbYeJXxljZ29+#l9zHVnRK>mk7I480K{3{dKnR>lEme!wv-5k@#l@=Ul9 z_j*ArY)s@dh61}ZaLUcwd7R~^=B8&|Gc3%->djo*Vi1RBf2SPc((zqWu^ojvp|mWo zfAJaf|KZ`G6t)D8*|LDW@$qdg8L`7Zd>q`obY*>oxk~ly9Wihxx3*SWSGHC*F@V_m z^Q&W)y(7BxHj0+tP2fh_U;k& z7dkvfVqNtSHV1t_LOZD^L6X9Xiy4a^RPIa2NVguDNpsr`vhwc4iQg9)D!7jQRqSIGijq+51=)bf z4#P6q#S(O3v$@X9ba%-{&cjI3VjRb3Nlp>krRi{{XBk(l{P4anQ+!!(liy*H|Nley z%scR-$^So&3;kY!fBzzI9asUDfmz@M`1NhT0qz3s1pW?x{zt$!0Rdb9R)7iM$MEZ4 z0KNg70bYlH|0m$r!1KTk@FZ{+_$&PTzW^@--v*uq?grk3Y<~mZ0R8~H2#~zr1?peT zK+QnSK+QnSK+QnSz&nqDrS;;!rS)a@cmqyQ>M9B;V#Db-PBv_@+&IY^c8C5Ra%^aa zuX7eV`oGE z+hlMW6L-X?G0zm6MSx|1NC|ea34+j^P8hKBM9c(n5Q>RY41vQ#P1*fA@S-SKjuOm6 zuodwz&fwqDKiUksvBWF-x0?(>6CMv($my_vQWTRZL;Op42-^pER6UvFJOTljBCzfs zFOD9n05FkuL@%wyk~<<^OPUA$dQYAB-FxN2)CDR z^#LcDH9=FVZsDIK2xFGfI5&%uk$?HxN_M*L% z$>Z=bAy!$rKGH#J#yfNnzrGHKfYh!!CUSKY%|)(O5f^dWbTrf&Ng3Cc_Na7)Oy@^` zjk~!Z6AA$p59E_D_m=I=A8t?g!c=5kNLlkzeVN?+gc&U0B&(T5YYa zEH2ZqkOxYyJ8-3YQ2(HdDC7wQ#c_C*hoL(&cXnpljRZ{*_`W(nqT$s@QL6DUbt@sA zzm!LRNa^Km9Z9MRjIi!dk;0?)bjd0k4h9lw#4lafQh{t+#^J!u6CK)OhJNZunelMU z48iZRw9yXTDwWahcPX%b=jb6W)G`%_c`4x}=tno{VhFVBPAwr?#JKMxv_GVv3a>rR zaqhRkBY$r8i89uvU{%OH#)(oFwI%_bxhCSxBdgm2^bdu From 90adbfc1053af56470e51f208fca7135ae269472 Mon Sep 17 00:00:00 2001 From: Ali Sabet Sarvestani Date: Thu, 28 Mar 2019 17:15:22 +0100 Subject: [PATCH 5/5] Removed fetch from pull request, was merged and unnecessary --- tensorflow/base-mpi/.Dockerfile.swo | Bin 0 -> 16384 bytes tensorflow/base-mpi/Dockerfile | 5 +---- 2 files changed, 1 insertion(+), 4 deletions(-) create mode 100644 tensorflow/base-mpi/.Dockerfile.swo diff --git a/tensorflow/base-mpi/.Dockerfile.swo b/tensorflow/base-mpi/.Dockerfile.swo new file mode 100644 index 0000000000000000000000000000000000000000..55024636c6c89bfb40b063d141e01269eee48224 GIT binary patch literal 16384 zcmeHOO>87b6>dTT0TMuf6oL4ujLka9Zg<;W?`GGMLmb=Lb+Xt$^juIxkO3efRGRe5F$b#@xAI_+hb=F zE`WNp-*$I({k;0>)vJ21+TH2Pt846xHErPcVZ(U!FV9~2?BsRhwI#z)yekq)e~X{z zIDV=U5zm~|)f|@nx~|X;{H_R{fvNuwc)YFbM1)GlJzoy(E?2?~BF|oy&bEkqo-Yoo zuSsthXc)Lb2GXQwp1IvPIXk20=tS#I_R)`Ay1~LtwuXU*frf#Gfrf#Gfrf#Gfrf#G zf&T{vlISMmOX$c=d1tor>$~<{-|F)iACB7l2Oyue=xefER$LfUCgWz;E7T7~cWD z3akUCfsX-yyTveG23`Vw3OogT4R{E+1Nhm^hVgCSTYwKt0WTsTqJYl;p9XFPo<$I4 z6*v!k3iu@ONAODAe)pXQN+y_+Y3vB*g(~5`udi|HB%TZv8zf1j=52d899q>!6qbzp zcAnqPiW?Km%zx~G)bm}oRep29hF&sYYwN2_rBNi~B+F=?UJ5!M%1k7q8t@32&F)A`N>2`FV@Rqff(kB` z9Jiqcq;aS%PD9q0(q*pb@zhV$e8#&o2$0B+@78#Q1U9q>MQSuwv{$dKSa83^rc4ua zMh;DsFu5{gDHGP3yuYP~e_X+gNpv?G_o<0HNsZ{Uqyfr|6{R9S*cqQ;TtiyItf;Ya z@iN;t7itxan<}~p>2WO|jO G0W;$sD;4#m zHEm5pji~e4>%Fh6gvVC@T9qT<+kzD~iLPZ?%dV!AX-2W=dAkc03s4ubIi{1kye#9m zte7(UBpVd#odsr0>eKZB(+C40$YUNv3EA&DpLT zhQ8#k8r$b=ZWVNA%{=5In;LApUR)cvnsF|S zG+h}@5^b03sl|Oc$Y2^4+VKSs(Kro*IV#35dsmlkul0uQ9LUi01GlTWb#_>o2GOV@ za$F5qQUY2s0^Sd}9>lXXy%|>`fCcIaFW~6A?}lMc#JH?HL(y_`s5< z-&%MyyUdIj%_EF%HXZlo$d>0c#(2vEh1;yu{#ZnkR-VKQ(=@O=S+!%Y6)m1L=d4rK zoOuD6cFo;0r#q+TNQ%rrRhx%G`{+vM9I{~%_F@@A%M<9|!C$7RrM21NF)?Ig>fovp zW#6NyVkuZu?$R+#Nrj-_hoPMgaP` zBjP+UNR>83`6En|Lw2cnOM9QZM#sS&slb0^Pp%l{f)xulh0CYBYuI0fE>@u+^2M%~ zjF{{(ETeraK@V=U*H|{)ZL*Q`Fp{(whvB^>r|`|fbXe1Kj4M`rct4aezN|ON?=Z;! zpM%f*CP4oGJ-E>CMfms61J{5RU>TSLo`YZC1uWne;AY@Y@aJCxegJ$Q_%h%D|AJ5d zPv8yUNucoaKaTr9z|a2)@I3G&Fat~jw*ha!&wm~GJg^9y1pW#?e+cXVK5#!kGQSM` z5cmf0G|>DS1{ww$1{ww$1{wzb=NMR8TV{{7;3&klqOcn_Wp3+4>yTtqEVoXumf55K z_xJ=y6f-PR)I+w0cj*rji6Ji6Xqy|`7#jle7Sb<7YBaXgBo7bfte zDeo87+nXzFb~cNaM+j+^kr{-85CMLRzm3!zKME=JLyzMBWvI&WdOV()Mz~3ZBAK10 zC=ecK6A3T!YU&{X2x*j+>?V4nN}FKEr!a1Df`F7k+SLwezE2z@jD=jrBkhfcmsdA3 zfosl@?f|y1OA?Wq5kfB54o8S)a}pI>SOi}XP@&*D1tU0I-I(2_BRGnp?kK?=L~RjR z;|z{6{q?8U4<%mFAAvGNUwAlTKBrR$N>NP441qM^ApY;+Q4M5@gAGJ-@(8_qcYcOZ zMUqLg4jCb1ae0|pI8&&N8q=y+6{wwav~bEH;693|)4S@19r6f69tgGYHsbb$f4I<=H>t!cq@g-qv1k2BA8K}Hl! z${)zRVfK;j%^z$pcO$ckbFs3sF5O~nET^u7Q-rX%xkxu7`tNL!R9y9|0}ndyo>ch1 zwJ3NsYxWKn6d>y(O#z&!v87APi;Tiuns$VzGck{xu=Q~$PxbcBRkTV?=_y}j$H71( zvvDn4*U}=HGW17g8tKp$%NQe9y2yAiW`^K*Q92t^JpiTqU053Z&eEfj>ryNb^HRc! zV1W8jAIxdjomxV)h;i3NXupg%RN=M9IL`eRc;KF%d%TFXDHsECk8z^ZN3D@Zr*1*s z5Nc#NMMVUcebL>XvCg!Lwh|!@4XI@UQ+#dz)W}#dsEhRp3CqP*2d5s0MmRz#L{k

IZVPe F{td@_YAyf( literal 0 HcmV?d00001 diff --git a/tensorflow/base-mpi/Dockerfile b/tensorflow/base-mpi/Dockerfile index 0ff3af4..36f39ef 100644 --- a/tensorflow/base-mpi/Dockerfile +++ b/tensorflow/base-mpi/Dockerfile @@ -95,10 +95,7 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8 # clone tensorflow repo WORKDIR /tmp -RUN git clone https://github.com/tensorflow/tensorflow.git && \ - cd tensorflow && \ - git fetch origin +refs/pull/26720/merge && \ - git checkout FETCH_HEAD +RUN git clone https://github.com/tensorflow/tensorflow.git # configure for CUDA 9.0 with mpi WORKDIR /tmp/tensorflow