# Dockerfile for building a GPU-enabled environment for the aperi-mech project
# CUDA Architecture: 75 (Tesla T4)
# This Dockerfile sets up a Python and Spack environment with the necessary packages
# After building the image, the user can run the container and start working on the aperi-mech project.
# Building the project:
# 1. This assumes the user has the aperi-mech project cloned in the same directory as the Dockerfile
# 2. Install prerequisites:
# - Docker
# - Nvidia drivers:
# sudo apt-get update
# sudo apt-get install -y ubuntu-drivers-common
# sudo ubuntu-drivers autoinstall
# nvidia-smi # Verify the driver installation; a reboot may be needed after the previous command
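# # Optional: print only the driver version, useful for comparing host and container later
# nvidia-smi --query-gpu=driver_version --format=csv,noheader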
# - Nvidia Container Toolkit:
# # Add the package repositories
# distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
# curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
# curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
#
# # Update the package lists
# sudo apt-get update
#
# # Install the NVIDIA Container Toolkit
# sudo apt-get install -y nvidia-docker2
#
# # Restart the Docker daemon to complete the installation
# sudo systemctl restart docker
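#
# # Optionally verify that containers can see the GPU (assumes Docker 19.03+ with --gpus support)
# docker run --rm --gpus all nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 nvidia-smi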
# - Probably others. TODO(jake) add more details the next time I set this up
# 3. Build the docker image using the following command:
# docker build -t aperi-mech-gpu:latest -f Dockerfile_Nvidia_T4_GPU . 2>&1 | tee build.log
# 4. Run the docker container using the following command (uses the docker-compose.yml file in the aperi-mech project):
# docker-compose -f docker-compose_nvidia_t4_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash
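# For reference, a minimal sketch of the GPU section such a compose file needs (the real
# docker-compose_nvidia_t4_gpu.yml lives in the aperi-mech project and may differ):
# services:
#   aperi-mech-gpu-development:
#     image: aperi-mech-gpu:latest
#     deploy:
#       resources:
#         reservations:
#           devices:
#             - driver: nvidia
#               count: 1
#               capabilities: [gpu]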
# Note: It is important that the driver versions are the same in the image and the host system. If there are problems with the drivers, the container will not be able to access the GPU.
# It is quicker to reinstall the drivers on the host system than to rebuild the image. Try the following commands:
# sudo apt-get update
# sudo apt-get upgrade
# sudo reboot
# sudo apt-get install linux-headers-$(uname -r)
# sudo apt-get purge nvidia*
# sudo apt-get install nvidia-driver-XXX # (XXX is the version number, make sure the full version number is the same as the one in the image, checked via nvidia-smi)
# sudo reboot
# # Reinstall the Nvidia Container Toolkit per the instructions above
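# # A quick way to confirm host and container agree on the driver version (illustrative one-liner):
# diff <(nvidia-smi --query-gpu=driver_version --format=csv,noheader) \
#      <(docker run --rm --gpus all aperi-mech-gpu:latest nvidia-smi --query-gpu=driver_version --format=csv,noheader)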
# 5. Start working on the aperi-mech project
# - Configure the project
# ./do_configure --gpu
# - Build the project:
# cd build/Release_gpu
# make -j 4
# - Run the project unit tests:
# make run_all_unit_tests
# - Run the project regression tests:
# TODO(jake) implement: make run_all_regression_tests
# - Run the project performance tests:
# TODO(jake) implement: make run_all_performance_tests
# Base image with CUDA support
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04
# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive
# Set CUDA architecture, 75 is for Tesla T4
ENV CUDA_ARCH=75
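# Other common values if targeting a different GPU: 70 (V100), 80 (A100), 86 (RTX 30xx / A10)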
#################### System Packages from apt ####################
# Install necessary packages
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
coreutils \
curl \
environment-modules \
file \
gfortran \
git \
git-lfs \
gpg \
lcov \
libcurl4-openssl-dev \
libgl1 \
libglu1-mesa \
libssl-dev \
lsb-release \
openssl \
python3 \
python3-distutils \
python3-venv \
python3-pip \
sudo \
unzip \
vim \
xorg \
zip \
&& rm -rf /var/lib/apt/lists/*
# Install NVIDIA utilities (nvidia-smi); the 535 package should match the host driver's major version
RUN apt-get update && apt-get install -y --no-install-recommends \
    nvidia-utils-535 \
    && rm -rf /var/lib/apt/lists/*
#################### User Setup ####################
# Create a non-root user
RUN useradd -m aperi-mech_docker
# Configure passwordless sudo for the user (the build still runs as root at this point)
RUN echo "aperi-mech_docker ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
# Change to the new user
USER aperi-mech_docker
# Make the working directory
WORKDIR /home/aperi-mech_docker
# Set the HOME environment variable
ENV HOME=/home/aperi-mech_docker
#################### Python Packages ####################
# Install python packages using pip
RUN pip3 install --no-input --no-cache-dir \
pytest==8.3.2 \
testbook==0.4.2 \
jupyter==1.0.0 \
jupyterlab==4.2.4 \
numpy==2.0.1 \
scipy==1.14.0 \
matplotlib==3.9.1 \
ipykernel==6.29.5 \
meshio==5.3.5 \
gmsh==4.13.1 \
netCDF4==1.7.1.post1 \
pandas==2.2.3 \
&& rm -rf ~/.cache/pip
# Add environment to Jupyter
RUN python3 -m ipykernel install --user --name aperi-mech --display-name "aperi-mech"
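# The registration can be checked with `jupyter kernelspec list`, which should show "aperi-mech"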
# Add .local/bin to the PATH
ENV PATH="${HOME}/.local/bin:$PATH"
#################### Spack Installation and Setup ####################
# Clone Spack repo
RUN git clone -c feature.manyFiles=true https://github.com/spack/spack.git ${HOME}/spack
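# Note: this clones Spack's default develop branch, so package versions can drift between builds;
# pinning a release branch (e.g. adding --branch releases/v0.22) would make builds more reproducible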
# Set up Spack environment variables
ENV SPACK_ROOT=${HOME}/spack
ENV PATH="$SPACK_ROOT/bin:$PATH"
# Note: sourcing setup-env.sh in its own RUN would not persist across layers, so each RUN below
# that needs the full Spack shell integration re-sources it
# Find compilers and externals for Spack
RUN spack compiler find && \
spack external find
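# The results are recorded under ~/.spack (compilers.yaml and packages.yaml) so Spack reuses
# system toolchains and packages instead of rebuilding them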
# Create a new Spack environment for the aperi-mech build
RUN spack env create aperi-mech
# Add packages to the Spack environment, aperi-mech
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e aperi-mech add compadre@master ~tests && \
spack -e aperi-mech add [email protected] +cuda ~shared cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add [email protected] +cuda +cuda_lambda +cuda_relocatable_device_code ~cuda_uvm ~shared +wrapper cxxstd=17 cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add trilinos@master ~amesos ~amesos2 ~anasazi ~aztec ~belos ~epetra ~epetraext ~ifpack ~ifpack2 ~ml ~muelu ~sacado ~shared +cuda +cuda_rdc +exodus +gtest +hdf5 +stk +zoltan +zoltan2 cxxstd=17 cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add [email protected] && \
spack -e aperi-mech add [email protected] && \
spack -e aperi-mech add eigen@master
# eigen@master is used because [email protected] produces a lot of GPU-related warnings
# Install Packages, aperi-mech
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e aperi-mech install --fresh
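# --fresh concretizes against the current package definitions rather than reusing a previous concretization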
# Create a second Spack environment for seacas; seacas is built without MPI because the MPI variant conflicts with trilinos
RUN spack env create seacas
# Add packages to the Spack environment, seacas
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e seacas add openmpi && \
spack -e seacas add seacas ~mpi ~tests ~x11
# Install Packages, seacas
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e seacas install --fresh
# Add the spack source command to the bashrc
RUN echo "source ${SPACK_ROOT}/share/spack/setup-env.sh" >> ${HOME}/.bashrc
# HEALTHCHECK to verify Spack and Python availability
HEALTHCHECK --interval=1m --timeout=10s --start-period=5s --retries=3 CMD /bin/bash -c "source ${SPACK_ROOT}/share/spack/setup-env.sh && python3 --version || exit 1"
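# The health status of a running container can be checked with, e.g.:
# docker inspect --format='{{json .State.Health}}' <container-name>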