23.08 Release preparation (#71)

* add dgl internal repo as a submodule * update Dockerfile to use latest pyt staged image, updated dgl install and fix torch-harmonics stage * update nvfuser API * update dgl submodule * updates * update paths * add vtk and pyvista * revert Dockerfile changes, update the package to new version * update the decorator version for onnx * fix typo * fix security issues in filesystem.py * remove dgl as modulus core submodule * update DGL build * move some packages to Dockerfile * update * update tensorly installs * add more arch support * update python version * add recursive option * update Dockerfile * update * add test for http package * update ci tensorflow version * update changelog
NVIDIA · Jul 21, 2023 · 0e5defb · 0e5defb
1 parent 25287d5
commit 0e5defb
Show file tree

Hide file tree

Showing 10 changed files with 103 additions and 59 deletions.
diff --git a/.gitmodules b/.gitmodules
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,9 +11,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Added a CHANGELOG.md
+- Added build support for internal DGL
 
 ### Changed
 
+- DGL install changed from PyPI to source
+
 ### Deprecated
 
 ### Removed
@@ -24,8 +27,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Security
 
+- Fixed security issues with subprocess and urllib in `filesystem.py`
+
 ### Dependencies
 
+- Updated the base container to the latest PyTorch base container (23.06), which is based on torch 2.0
+- Container now supports CUDA 12, Python 3.10
+
 ## [0.1.0] - 2023-05-08
 
 ### Added

diff --git a/Dockerfile b/Dockerfile
@@ -12,72 +12,85 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG PYT_VER=22.12
+ARG PYT_VER=23.06
 FROM nvcr.io/nvidia/pytorch:$PYT_VER-py3 as builder
 
 # Update pip and setuptools
 RUN pip install --upgrade pip setuptools  
 
-# Setup git lfs
+# Set up git lfs, graphviz, and libgl1 (vtk dependency)
 RUN apt-get update && \
-    apt-get install -y git-lfs && \
+    apt-get install -y git-lfs graphviz libgl1 && \
     git lfs install
 
-# Install nightly build of dgl
-RUN pip install --no-deps --pre dgl -f https://data.dgl.ai/wheels/cu117/repo.html
-RUN pip install --no-deps --pre dglgo -f https://data.dgl.ai/wheels-test/repo.html
-ENV DGLBACKEND=pytorch
-
 ENV _CUDA_COMPAT_TIMEOUT=90
 
+# TODO remove benchy dependency
+RUN pip install git+https://github.com/romerojosh/benchy.git
+# TODO use torch-harmonics pip package after the upgrade
+RUN pip install https://github.com/NVIDIA/torch-harmonics/archive/8826246cacf6c37b600cdd63fde210815ba238fd.tar.gz
+RUN pip install "tensorly>=0.8.1" "vtk>=9.2.6" "pyvista>=0.40.1" https://github.com/tensorly/torch/archive/715a0daa7ae0cbdb443d06780a785ae223108903.tar.gz
+
+# Install DGL (internal build if present, otherwise built from upstream source)
+ARG DGL_BACKEND=pytorch
+ENV DGL_BACKEND=$DGL_BACKEND
+ENV DGLBACKEND=$DGL_BACKEND
+
+COPY . /modulus/
+RUN if [ -e "/modulus/deps/dgl" ]; then \
+	echo "Internal DGL exists. Using internal DGL build" && \
+	cp -r /modulus/deps/dgl/ /opt/ && \
+	mkdir /opt/dgl/dgl-source/build \
+	&& cd /opt/dgl/dgl-source/build \
+	&& export NCCL_ROOT=/usr \
+	&& cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release \
+        	-DUSE_CUDA=ON -DCUDA_ARCH_BIN="60 70 75 80 86 90" -DCUDA_ARCH_PTX="90" \
+        	-DCUDA_ARCH_NAME="Manual" \
+        	-DBUILD_TORCH=ON \
+        	-DBUILD_SPARSE=ON \
+	&& cmake --build . \
+	&& cd ../python \
+	&& python setup.py bdist_wheel \
+	&& pip install ./dist/dgl*.whl \
+	&& rm -rf ./dist \
+	&& rm -rf ../build \
+	&& cd /opt/dgl/ \
+	&& pip install --no-cache-dir -r requirements.txt; \
+    else \
+	echo "No Internal DGL present. Building from source" && \
+	git clone --recurse-submodules https://github.com/dmlc/dgl.git && \
+	cd dgl/ && DGL_HOME="/workspace/dgl/" bash script/build_dgl.sh -g && \
+	cd python && \
+	python setup.py install && \
+	python setup.py build_ext --inplace; \
+    fi
+
+# cleanup of stage
+RUN rm -rf /modulus/ 
+
 # Install custom onnx
 # TODO: Find a fix to eliminate the custom build
 # Forcing numpy update to override numba 0.56.4 max numpy constraint
 COPY . /modulus/ 
-RUN if [ -e "/modulus/deps/onnxruntime_gpu-1.14.0-cp38-cp38-linux_x86_64.whl" ]; then \
+RUN if [ -e "/modulus/deps/onnxruntime_gpu-1.15.1-cp310-cp310-linux_x86_64.whl" ]; then \
 	echo "Custom wheel exists, installing!" && \
-	pip install --force-reinstall /modulus/deps/onnxruntime_gpu-1.14.0-cp38-cp38-linux_x86_64.whl; \
+	pip install --force-reinstall /modulus/deps/onnxruntime_gpu-1.15.1-cp310-cp310-linux_x86_64.whl; \
     else \
 	echo "No custom wheel present, skipping" && \
-	pip install numpy==1.22.4; \
+	pip install "numpy==1.22.4"; \
     fi
 # cleanup of stage
 RUN rm -rf /modulus/ 
 
 # CI image
 FROM builder as ci
-RUN pip install tensorflow>=2.11.0 warp-lang>=0.6.0 black==22.10.0 interrogate==1.5.0 coverage==6.5.0 protobuf==3.20.0 
-# TODO remove benchy dependency
-RUN pip install git+https://github.com/romerojosh/benchy.git
-# TODO use torch-harmonics pip package after the upgrade
-RUN pip install https://github.com/NVIDIA/torch-harmonics/archive/8826246cacf6c37b600cdd63fde210815ba238fd.tar.gz
-
-# install libcugraphops and pylibcugraphops
-ENV DEBIAN_FRONTEND=noninteractive
-ENV TZ=Etc/UTC
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
-RUN apt-get update &&\
-    apt-get install -y software-properties-common &&\
-    add-apt-repository ppa:ubuntu-toolchain-r/test &&\
-    apt-get install -y libstdc++6
-RUN mkdir -p /opt/cugraphops &&\
-    cd /opt/cugraphops &&\
-    wget https://anaconda.org/nvidia/libcugraphops/23.04.00/download/linux-64/libcugraphops-23.04.00-cuda11_230412_ga76892e3_0.tar.bz2 &&\
-    wget https://anaconda.org/nvidia/pylibcugraphops/23.04.00/download/linux-64/pylibcugraphops-23.04.00-cuda11_py38_230412_ga76892e3_0.tar.bz2 &&\
-    tar -xf libcugraphops-23.04.00-cuda11_230412_ga76892e3_0.tar.bz2 &&\
-    tar -xf pylibcugraphops-23.04.00-cuda11_py38_230412_ga76892e3_0.tar.bz2 &&\
-    rm libcugraphops-23.04.00-cuda11_230412_ga76892e3_0.tar.bz2 &&\
-    rm pylibcugraphops-23.04.00-cuda11_py38_230412_ga76892e3_0.tar.bz2
-
-ENV PYTHONPATH="${PYTHONPATH}:/opt/cugraphops/lib/python3.8/site-packages"
-
+RUN pip install "tensorflow>=2.9.0" "warp-lang>=0.6.0" "black==22.10.0" "interrogate==1.5.0" "coverage==6.5.0" "protobuf==3.20.0" 
 COPY . /modulus/
 RUN cd /modulus/ && pip install -e . && rm -rf /modulus/
 
 # Deployment image
 FROM builder as deploy
-RUN pip install protobuf==3.20.0 
+RUN pip install "protobuf==3.20.0"
 COPY . /modulus/
 RUN cd /modulus/ && pip install .
 
@@ -87,6 +100,6 @@ RUN rm -rf /modulus/
 # Docs image
 FROM deploy as docs
 # Install CI packages
-RUN pip install tensorflow>=2.11.0 warp-lang>=0.6.0 protobuf==3.20.0
+RUN pip install "tensorflow>=2.9.0" "warp-lang>=0.6.0" "protobuf==3.20.0"
 # Install packages for Sphinx build
-RUN pip install recommonmark==0.7.1 sphinx==5.1.1 sphinx-rtd-theme==1.0.0 pydocstyle==6.1.1 nbsphinx==0.8.9 nbconvert==6.4.3 jinja2==3.0.3
+RUN pip install "recommonmark==0.7.1" "sphinx==5.1.1" "sphinx-rtd-theme==1.0.0" "pydocstyle==6.1.1" "nbsphinx==0.8.9" "nbconvert==6.4.3" "jinja2==3.0.3"
diff --git a/modulus/models/layers/fused_silu.py b/modulus/models/layers/fused_silu.py
@@ -15,7 +15,7 @@
 import functools
 import torch
 from torch.autograd import Function
-from torch._C._nvfuser import Fusion, FusionDefinition, DataType
+from nvfuser._C import Fusion, FusionDefinition, DataType
 
 
 _torch_dtype_to_nvfuser = {

diff --git a/modulus/utils/filesystem.py b/modulus/utils/filesystem.py
@@ -19,7 +19,7 @@
 import urllib.request
 import os
 import hashlib
-import subprocess
+import requests
 
 import logging
 
@@ -28,7 +28,7 @@
 try:
     LOCAL_CACHE = os.environ["LOCAL_CACHE"]
 except KeyError:
-    LOCAL_CACHE = os.environ["HOME"] + "/.cache/modulus"
+    LOCAL_CACHE = os.environ["HOME"] + "/.cache"
 
 
 def _cache_fs(fs):
@@ -55,15 +55,16 @@ def _download_cached(path: str, recursive: bool = False) -> str:
     if not os.path.exists(cache_path):
         logger.debug("Downloading %s to cache: %s", path, cache_path)
         if path.startswith("s3://"):
-            if recursive:
-                subprocess.check_call(
-                    ["aws", "s3", "cp", path, cache_path, "--recursive"]
-                )
-            else:
-                subprocess.check_call(["aws", "s3", "cp", path, cache_path])
+            fs = _get_fs(path)
+            fs.get(path, cache_path, recursive=recursive)
         elif url.scheme == "http":
+            # urllib.request.urlretrieve(path, cache_path)
             # TODO: Check if this supports directory fetches
-            urllib.request.urlretrieve(path, cache_path)
+            response = requests.get(path, stream=True, timeout=5)
+            with open(cache_path, "wb") as output:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        output.write(chunk)
         elif url.scheme == "file":
             path = os.path.join(url.netloc, url.path)
             return path

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 description = "A deep learning framework for AI-driven multi-physics systems"
 readme = "README.md"
-requires-python = ">=3.7"
+requires-python = ">=3.8"
 license = {text = "Apache 2.0"}
 dependencies = [
     "h5py>=3.7.0",
@@ -20,8 +20,6 @@ dependencies = [
     "pytest>=6.0.0",
     "ruamel.yaml>=0.17.22",
     "setuptools>=67.6.0",
-    "tensorly>=0.8.1",
-    "tensorly-torch>=0.4.0",
     "torch>=1.12",
     "xarray>=2023.1.0",
     "zarr>=2.14.2",

diff --git a/test/deploy/test_onnx_fft.py b/test/deploy/test_onnx_fft.py
@@ -41,10 +41,10 @@ def check_ort_version():
             True,
             reason="Proper ONNX runtime is not installed. 'pip install onnxruntime onnxruntime_gpu'",
         )
-    elif ort.__version__ != "1.14.0":
+    elif ort.__version__ != "1.15.1":
         return pytest.mark.skipif(
             True,
-            reason="Must install custom ORT 1.14.0. Other versions do not work \
+            reason="Must install custom ORT 1.15.1. Other versions do not work \
         due to bug in IRFFT: https://github.com/microsoft/onnxruntime/issues/13236",
         )
     else:

diff --git a/test/deploy/test_onnx_utils.py b/test/deploy/test_onnx_utils.py
@@ -38,10 +38,10 @@ def check_ort_version():
             True,
             reason="Proper ONNX runtime is not installed. 'pip install onnxruntime onnxruntime_gpu'",
         )
-    elif ort.__version__ != "1.14.0":
+    elif ort.__version__ != "1.15.1":
         return pytest.mark.skipif(
             True,
-            reason="Must install custom ORT 1.14.0. Other versions do not work \
+            reason="Must install custom ORT 1.15.1. Other versions do not work \
         due to bug in IRFFT: https://github.com/microsoft/onnxruntime/issues/13236",
         )
     else:

diff --git a/test/models/common/inference.py b/test/models/common/inference.py
@@ -38,10 +38,10 @@ def check_ort_version():
             True,
             reason="Proper ONNX runtime is not installed. 'pip install onnxruntime onnxruntime_gpu'",
         )
-    elif ort.__version__ != "1.14.0":
+    elif ort.__version__ != "1.15.1":
         return pytest.mark.skipif(
             True,
-            reason="Must install custom ORT 1.14.0. Other versions do not work \
+            reason="Must install custom ORT 1.15.1. Other versions do not work \
         due to bug in IRFFT: https://github.com/microsoft/onnxruntime/issues/13236",
         )
     else:

diff --git a/test/utils/test_filesystem.py b/test/utils/test_filesystem.py
@@ -12,10 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import hashlib
 from pathlib import Path
 from modulus.utils import filesystem
 
 
+def calculate_checksum(file_path):
+    sha256 = hashlib.sha256()
+
+    with open(file_path, "rb") as f:
+        while True:
+            data = f.read(8192)
+            if not data:
+                break
+            sha256.update(data)
+
+    calculated_checksum = sha256.hexdigest()
+    return calculated_checksum
+
+
 def test_package(tmp_path: Path):
     string = "hello"
     afile = tmp_path / "a.txt"
@@ -28,3 +43,12 @@ def test_package(tmp_path: Path):
         ans = f.read()
 
     assert ans == string
+
+
+def test_http_package():
+    test_url = "http://raw.githubusercontent.com/NVIDIA/modulus/main/docs/img"
+    package = filesystem.Package(test_url, seperator="/")
+    path = package.get("modulus-pipes.jpg")
+
+    known_checksum = "e075b2836d03f7971f754354807dcdca51a7875c8297cb161557946736d1f7fc"
+    assert calculate_checksum(path) == known_checksum