From 932bdeb0b529643811dd378584a4ed7c97b80b17 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Thu, 9 Jun 2022 13:04:57 +0000 Subject: [PATCH 01/45] Upgrade to TensorFlow 2.9 DO_NOT_SUBMIT: Wait until new base image with TensorFlow 2.9.1 is out. http://b/207851560 --- Dockerfile.tmpl | 14 +++++++++----- config.txt | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 0210890a..7b9ca54c 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -26,7 +26,8 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.6.4 +# TODO(rosbo): Still 2.9.0 rc2. Wait for the 2.9.1 +ENV TENSORFLOW_VERSION=2.9.0 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact @@ -132,7 +133,8 @@ RUN pip install jax[cpu] && \ # Install mxnet {{ if eq .Accelerator "gpu" }} -RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ +# No specific package for 11.3 minor versions, using 11.2 instead. +RUN pip install mxnet-cu$112 && \ /tmp/clean-layer.sh {{ else }} RUN pip install mxnet && \ @@ -154,7 +156,8 @@ RUN pip install spacy && \ RUN pip install pycuda && \ pip install pynvrtc && \ pip install pynvml && \ - pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ + # No specific package for 11.x minor versions. + pip install nnabla-ext-cuda$110 && \ /tmp/clean-layer.sh {{ end }} @@ -167,8 +170,9 @@ RUN pip install pysal && \ # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. apt-get install -y default-jre-headless && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ - pip install tensorflow-gcs-config==2.6.0 && \ - pip install tensorflow-addons==0.14.0 && \ + pip install tensorflow-gcs-config==${TENSORFLOW_VERSION} && \ + # TODO(b/207851560) Upgrade to 0.17.1 once the base image with TensorFlow 2.9.1 is out. + pip install tensorflow-addons==0.17.0 && \ /tmp/clean-layer.sh RUN apt-get install -y libfreetype6-dev && \ diff --git a/config.txt b/config.txt index 226e6f9e..e6a1b0b8 100644 --- a/config.txt +++ b/config.txt @@ -1,7 +1,7 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release BASE_IMAGE_TAG=m94 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-6 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-6 +CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 +GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 TORCH_VERSION=1.11.0 # TODO(b/215031404#comment4) Remove zlib sed command after upgrade to >= 0.11.1 From 019234e28b6be332aec12cda11cbcadc159b83fb Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Wed, 6 Jul 2022 21:56:00 +0000 Subject: [PATCH 02/45] Upgrade PyTorch to 1.12 http://b/238238619 --- config.txt | 9 ++++----- packages/torch.Dockerfile | 4 +--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/config.txt b/config.txt index e6a1b0b8..83d4142c 100644 --- a/config.txt +++ b/config.txt @@ -3,10 +3,9 @@ BASE_IMAGE_TAG=m94 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 -TORCH_VERSION=1.11.0 -# TODO(b/215031404#comment4) Remove zlib sed command after upgrade to >= 0.11.1 -TORCHAUDIO_VERSION=0.11.0 -TORCHTEXT_VERSION=0.12.0 -TORCHVISION_VERSION=0.12.0 +TORCH_VERSION=1.12.0 +TORCHAUDIO_VERSION=0.12.0 +TORCHTEXT_VERSION=0.13.0 +TORCHVISION_VERSION=0.13.0 CUDA_MAJOR_VERSION=11 CUDA_MINOR_VERSION=0 diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index 0843082b..e1c00fa0 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -13,7 +13,7 @@ ARG CUDA_MINOR_VERSION RUN test -n "$TORCHVISION_VERSION" # Build instructions: https://github.com/pytorch/pytorch#from-source -RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools==59.5.0 cmake cffi typing_extensions future six requests dataclasses +RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. @@ -44,8 +44,6 @@ RUN cd /usr/local/src && \ git checkout tags/v$TORCHAUDIO_VERSION && \ git submodule sync && \ git submodule update --init --recursive --jobs 0 && \ - # TODO(b/215031404#comment4) Remove after upgrade next release (0.11.1) - sed -i s?https://zlib.net/zlib-1.2.11.tar.gz?https://sourceforge.net/projects/libpng/files/zlib/1.2.11/zlib-1.2.11.tar.gz? third_party/zlib/CMakeLists.txt && \ python setup.py bdist_wheel # Build torchtext From 9fda5076abc31ddcfc0b8369ee0ffa67fe2511c7 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Wed, 3 Aug 2022 17:54:06 +0000 Subject: [PATCH 03/45] Set CUDA_MINOR_VERSION to 3 --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index 83d4142c..da2e96ec 100644 --- a/config.txt +++ b/config.txt @@ -8,4 +8,4 @@ TORCHAUDIO_VERSION=0.12.0 TORCHTEXT_VERSION=0.13.0 TORCHVISION_VERSION=0.13.0 CUDA_MAJOR_VERSION=11 -CUDA_MINOR_VERSION=0 +CUDA_MINOR_VERSION=3 From 4bd449696617f5c73a97aa81e9e9f3c457934a42 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Wed, 3 Aug 2022 21:54:54 +0000 Subject: [PATCH 04/45] Add conda libs to LD_LIBRARY_PATH --- Dockerfile.tmpl | 3 +++ packages/torch.Dockerfile | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 7b9ca54c..fc5de57c 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -8,6 +8,9 @@ ARG TORCHAUDIO_VERSION ARG TORCHTEXT_VERSION ARG TORCHVISION_VERSION +# Ensures shared libraries installed with conda can be found by the dynamic link loader. +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" + {{ if eq .Accelerator "gpu" }} FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index e1c00fa0..4d86b00f 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -21,6 +21,9 @@ RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION ENV PYTORCH_BUILD_NUMBER=1 +# Ensures shared libraries installed with conda can be found by the dynamic link loader. +# For PyTorch, we need specifically mkl. +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX" ENV FORCE_CUDA=1 RUN cd /usr/local/src && \ From 4896dad6aa056463298eb17aa2c2c1cb52533eb2 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 10 Aug 2022 15:54:25 -0400 Subject: [PATCH 05/45] Use DLVM image m95 --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index da2e96ec..722540f7 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m94 +BASE_IMAGE_TAG=m95 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 49aafb71423f6fe6148389b65e733087e6dcf786 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 10 Aug 2022 15:55:10 -0400 Subject: [PATCH 06/45] The tag name is m95_release for some reason --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index 722540f7..72f7847b 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m95 +BASE_IMAGE_TAG=m95_release CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 776a50a296e0b017988a62cc84d5014831c1b47e Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 10 Aug 2022 17:10:43 -0400 Subject: [PATCH 07/45] Reorder FROM & ENV so that build works correctly --- Dockerfile.tmpl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 0906b0f5..3222009f 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -8,13 +8,18 @@ ARG TORCHAUDIO_VERSION ARG TORCHTEXT_VERSION ARG TORCHVISION_VERSION -# Ensures shared libraries installed with conda can be found by the dynamic link loader. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" - {{ if eq .Accelerator "gpu" }} FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} +{{ else }} +FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} +{{ end }} + +# Ensures shared libraries installed with conda can be found by the dynamic link loader. +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" + +{{ if eq .Accelerator "gpu" }} ARG CUDA_MAJOR_VERSION ARG CUDA_MINOR_VERSION ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} @@ -25,12 +30,10 @@ ENV PATH=/opt/bin:${PATH} ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 -{{ else }} -FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} + # Keep these variables in sync if base image is updated. -# TODO(rosbo): Still 2.9.0 rc2. Wait for the 2.9.1 -ENV TENSORFLOW_VERSION=2.9.0 +ENV TENSORFLOW_VERSION=2.9.1 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact From d674a42d6188bec2de6d03be88f43c0e4e9dbaa1 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 15 Aug 2022 13:30:37 -0400 Subject: [PATCH 08/45] They fixed the tag to m95 --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index 72f7847b..722540f7 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m95_release +BASE_IMAGE_TAG=m95 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 8356a51849d7b631473f117079037d7fa1f07b6f Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 15 Aug 2022 21:32:40 -0400 Subject: [PATCH 09/45] Use LIBRARY_PATH instead of LD_LIBRARY_PATH for linking --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 3222009f..57196615 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -17,7 +17,7 @@ FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} # Ensures shared libraries installed with conda can be found by the dynamic link loader. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" {{ if eq .Accelerator "gpu" }} ARG CUDA_MAJOR_VERSION From 855fe8f99f831ae35387c88112345787bdba15cd Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 15 Aug 2022 22:15:30 -0400 Subject: [PATCH 10/45] We need both LIBRARY_PATH and LD_LIBRARY_PATH --- Dockerfile.tmpl | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 57196615..4ad2a088 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -18,6 +18,7 @@ FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} # Ensures shared libraries installed with conda can be found by the dynamic link loader. ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" {{ if eq .Accelerator "gpu" }} ARG CUDA_MAJOR_VERSION From 85e1c423ca1f42477167e5d8575966a29e2e5a52 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 15 Aug 2022 22:16:48 -0400 Subject: [PATCH 11/45] LIBRARY_PATH & LD_LIBRARY_PATH for linking --- packages/torch.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index 4d86b00f..b9e0f30e 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -23,6 +23,7 @@ ENV PYTORCH_BUILD_NUMBER=1 # Ensures shared libraries installed with conda can be found by the dynamic link loader. # For PyTorch, we need specifically mkl. +ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX" ENV FORCE_CUDA=1 @@ -82,4 +83,4 @@ COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl # Print out the built .whl file. -RUN ls -lh /tmp/whl/ \ No newline at end of file +RUN ls -lh /tmp/whl/ From fb69472daafdb35ea5b3b274d838318993070f94 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 16 Aug 2022 10:47:42 -0400 Subject: [PATCH 12/45] ncurses.h is required to install torch audio --- packages/torch.Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index b9e0f30e..e9d2c167 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -42,7 +42,10 @@ RUN pip install /usr/local/src/pytorch/dist/*.whl # Instructions: https://github.com/pytorch/audio#from-source # See comment above for PYTORCH_BUILD_VERSION. ENV BUILD_VERSION=$TORCHAUDIO_VERSION -RUN cd /usr/local/src && \ +RUN sudo apt-get update && \ + # ncurses.h is required for this install + sudo apt-get install libncurses-dev && \ + cd /usr/local/src && \ git clone https://github.com/pytorch/audio && \ cd audio && \ git checkout tags/v$TORCHAUDIO_VERSION && \ From 74046c252cd94e7fbae2768e6d73c355ac47dc99 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 16 Aug 2022 17:10:19 -0400 Subject: [PATCH 13/45] Bump cudf/cuml to 21.12 This is the newest version that supports both CUDA 11.X & Python 3.7 --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 4ad2a088..0ae3d01a 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -89,7 +89,7 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ - conda install cudf=21.10 cuml=21.10 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ + conda install cudf=21.12 cuml=21.12 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ /tmp/clean-layer.sh {{ end }} From ba6160ceaea5112d19a73760fec2466dac61003c Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 16 Aug 2022 17:50:50 -0400 Subject: [PATCH 14/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 0ae3d01a..ed79684e 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -141,7 +141,7 @@ RUN pip install jax[cpu] && \ # Install mxnet {{ if eq .Accelerator "gpu" }} # No specific package for 11.3 minor versions, using 11.2 instead. -RUN pip install mxnet-cu$112 && \ +RUN pip install mxnet-cu112 && \ /tmp/clean-layer.sh {{ else }} RUN pip install mxnet && \ From df511c3103e3c304cc97e24f3772a31b541556ee Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 16 Aug 2022 17:57:41 -0400 Subject: [PATCH 15/45] remove extra $ --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index ed79684e..877eab4d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -164,7 +164,7 @@ RUN pip install pycuda && \ pip install pynvrtc && \ pip install pynvml && \ # No specific package for 11.x minor versions. - pip install nnabla-ext-cuda$110 && \ + pip install nnabla-ext-cuda110 && \ /tmp/clean-layer.sh {{ end }} From a86e055990a89eb5603cf95bd6945fc817d8020d Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Wed, 17 Aug 2022 15:58:22 +0000 Subject: [PATCH 16/45] update tensorflow decision forest and addons --- Dockerfile.tmpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 877eab4d..a7264f02 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -179,8 +179,8 @@ RUN pip install pysal && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ pip install tensorflow-gcs-config==${TENSORFLOW_VERSION} && \ # TODO(b/207851560) Upgrade to 0.17.1 once the base image with TensorFlow 2.9.1 is out. - pip install tensorflow-addons==0.17.0 && \ - pip install tensorflow_decision_forests==0.2.0 && \ + pip install tensorflow-addons==0.17.1 && \ + pip install tensorflow_decision_forests==0.2.7 && \ /tmp/clean-layer.sh RUN apt-get install -y libfreetype6-dev && \ From 3df3b6dc4c964e4d2326d6348eb4abdd7732bb9c Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 19 Sep 2022 11:53:24 -0400 Subject: [PATCH 17/45] Update config.txt --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index 722540f7..fab6002b 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m95 +BASE_IMAGE_TAG=m96 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 6bb31b0fa5393bee62d92f83f3e44164ea9d4d7c Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Mon, 19 Sep 2022 19:12:08 +0000 Subject: [PATCH 18/45] fix pytorch audio and drop allennlp and dlib --- Dockerfile.tmpl | 12 +++++++++--- packages/torch.Dockerfile | 2 ++ tests/test_allennlp.py | 15 --------------- tests/test_dlib.py | 14 -------------- 4 files changed, 11 insertions(+), 32 deletions(-) delete mode 100644 tests/test_allennlp.py delete mode 100644 tests/test_dlib.py diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index a7264f02..1bea1757 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -402,7 +402,8 @@ RUN pip install bleach && \ pip install ipykernel && \ pip install ipython && \ pip install ipython-genutils && \ - pip install ipywidgets && \ + # Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 + pip install ipywidgets==7.7.1 && \ pip install isoweek && \ pip install jedi && \ pip install jsonschema && \ @@ -445,7 +446,9 @@ RUN pip install bleach && \ pip install pyarrow && \ pip install feather-format && \ pip install fastai && \ - pip install allennlp && \ + # allennlp is in maintenance mode https://github.com/allenai/allennlp + # It downgrades the Pytorch install to 1.11. + #pip install allennlp && \ pip install importlib-metadata && \ python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ apt-get install -y ffmpeg && \ @@ -504,7 +507,10 @@ RUN pip install flashtext && \ pip install transformers && \ # b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.* pip install datasets==2.1.0 && \ - pip install dlib && \ + # dlib has a libmkl incompatibility: + # test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. + # Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. + #pip install dlib && \ pip install kaggle-environments && \ pip install geopandas && \ pip install nnabla && \ diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index e9d2c167..82eca376 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -45,6 +45,8 @@ ENV BUILD_VERSION=$TORCHAUDIO_VERSION RUN sudo apt-get update && \ # ncurses.h is required for this install sudo apt-get install libncurses-dev && \ + # Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685 + conda install -c conda-forge ncurses && \ cd /usr/local/src && \ git clone https://github.com/pytorch/audio && \ cd audio && \ diff --git a/tests/test_allennlp.py b/tests/test_allennlp.py deleted file mode 100644 index 1bc80f2d..00000000 --- a/tests/test_allennlp.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from allennlp.data.tokenizers import SpacyTokenizer - - -class TestAllenNlp(unittest.TestCase): - # reference - # https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/tokenizers/word_tokenizer_test.py - def test_passes_through_correctly(self): - tokenizer = SpacyTokenizer() - sentence = "this (sentence) has 'crazy' \"punctuation\"." - tokens = [t.text for t in tokenizer.tokenize(sentence)] - expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"", - "punctuation", "\"", "."] - self.assertSequenceEqual(tokens, expected_tokens) diff --git a/tests/test_dlib.py b/tests/test_dlib.py deleted file mode 100644 index 9e8ff28d..00000000 --- a/tests/test_dlib.py +++ /dev/null @@ -1,14 +0,0 @@ -import unittest - -import cv2 -import dlib - - -class TestDLib(unittest.TestCase): - def test_dlib_face_detector(self): - detector = dlib.get_frontal_face_detector() - image = cv2.imread('/input/tests/data/face.jpg') - image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - faces = detector(image_gray, 1) - - self.assertEqual(len(faces), 1) From 54a0a3283ba9510a5bfa9e909f8390d95ebd5862 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Tue, 20 Sep 2022 16:36:00 +0000 Subject: [PATCH 19/45] fix cpu build, rm nnabla --- Dockerfile.tmpl | 10 ++++++---- tests/test_nnabla.py | 28 ---------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) delete mode 100644 tests/test_nnabla.py diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 1bea1757..88603925 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -34,7 +34,7 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib {{ end }} # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.9.1 +ENV TENSORFLOW_VERSION=2.9.2 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact @@ -177,7 +177,7 @@ RUN pip install pysal && \ # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. apt-get install -y default-jre-headless && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ - pip install tensorflow-gcs-config==${TENSORFLOW_VERSION} && \ + pip install "tensorflow-gcs-config<=${TENSORFLOW_VERSION}" && \ # TODO(b/207851560) Upgrade to 0.17.1 once the base image with TensorFlow 2.9.1 is out. pip install tensorflow-addons==0.17.1 && \ pip install tensorflow_decision_forests==0.2.7 && \ @@ -496,7 +496,8 @@ RUN pip install flashtext && \ pip install optuna && \ pip install plotly_express && \ pip install albumentations && \ - pip install catalyst && \ + # Breaks protobuf compatibiilty in newer versions: + pip install catalyst tensorboardX==2.5.1 && \ # b/206990323 osmx 1.1.2 requires numpy >= 1.21 which we don't want. pip install osmnx==1.1.1 && \ apt-get -y install libspatialindex-dev && \ @@ -513,7 +514,8 @@ RUN pip install flashtext && \ #pip install dlib && \ pip install kaggle-environments && \ pip install geopandas && \ - pip install nnabla && \ + # Breaks protobuf compatibiilty: + #pip install nnabla && \ pip install vowpalwabbit && \ pip install pydub && \ pip install pydegensac && \ diff --git a/tests/test_nnabla.py b/tests/test_nnabla.py deleted file mode 100644 index ffb37fa4..00000000 --- a/tests/test_nnabla.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest - -import numpy as np -import nnabla as nn -import nnabla.functions as F -from nnabla.ext_utils import get_extension_context - -from common import gpu_test - - -class TestNNabla(unittest.TestCase): - def test_addition(self): - # entry variables - a = nn.Variable.from_numpy_array(np.random.random()) - b = nn.Variable.from_numpy_array(np.random.random()) - - # add operation - c = a + b - - # forward - c.forward() - - self.assertAlmostEqual(c.d, a.d + b.d, places=3) - - @gpu_test - def test_cuda_ext(self): - ctx = get_extension_context('cudnn', device_id='0') - nn.set_default_context(ctx) From d6986db59e4db7676c903018b2c3c28ab30d743e Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Thu, 22 Sep 2022 13:56:43 +0000 Subject: [PATCH 20/45] gpu build timed out, increase build time to 12hr for now --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index df058e8b..f8ebdb68 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -122,7 +122,7 @@ pipeline { stages { stage('Build GPU Image') { options { - timeout(time: 180, unit: 'MINUTES') + timeout(time: 720, unit: 'MINUTES') } steps { sh '''#!/bin/bash From 0254f612baf3a69e88f48d0dcfb253416e39ad82 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 26 Sep 2022 12:06:20 -0400 Subject: [PATCH 21/45] Update Jenkinsfile --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f8ebdb68..ebcd5403 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -122,7 +122,7 @@ pipeline { stages { stage('Build GPU Image') { options { - timeout(time: 720, unit: 'MINUTES') + timeout(time: 4324, unit: 'MINUTES') } steps { sh '''#!/bin/bash From 6ce9512390862c66ff810da5f0e3c4969260622b Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 30 Sep 2022 12:15:09 -0400 Subject: [PATCH 22/45] Trying m97 --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index fab6002b..c04d68fb 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m96 +BASE_IMAGE_TAG=m97 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 3409a95cd89c10be993a30a442052ec1f9ea8a59 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 30 Sep 2022 14:55:39 -0400 Subject: [PATCH 23/45] Install rapids ai using mamba which is *incredibly* faster --- Dockerfile.tmpl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 88603925..9da2c772 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -89,7 +89,8 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ - conda install cudf=21.12 cuml=21.12 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ + conda install -c conda-forge mamba && \ + mamba install cudf=21.12 cuml=21.12 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ /tmp/clean-layer.sh {{ end }} From 0aed512871f4120e52ca2a339a3c32accf54e5af Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 30 Sep 2022 15:06:15 -0400 Subject: [PATCH 24/45] Allow mamba to find a compatible Rapids version --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 9da2c772..aa63e969 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -90,7 +90,7 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ conda install -c conda-forge mamba && \ - mamba install cudf=21.12 cuml=21.12 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ + mamba install cudf cuml cudatoolkit && \ /tmp/clean-layer.sh {{ end }} From ef84269fd394bfc80015a9e7db326d9dbd08f3b5 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 3 Oct 2022 11:40:21 -0400 Subject: [PATCH 25/45] Trying m96 with mamba --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index c04d68fb..fab6002b 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m97 +BASE_IMAGE_TAG=m96 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From 9471f17134c8f6417985c3f99e4c380870c5b8b2 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 3 Oct 2022 15:21:25 -0400 Subject: [PATCH 26/45] implicit started using a Matrix type which was breaking this check --- tests/test_implicit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_implicit.py b/tests/test_implicit.py index 83eed4bb..bd18dc15 100644 --- a/tests/test_implicit.py +++ b/tests/test_implicit.py @@ -28,5 +28,5 @@ def test_model(self): model.fit(counts, show_progress=False) rows, cols = model.item_factors, model.user_factors - assert not np.isnan(np.sum(cols)) - assert not np.isnan(np.sum(rows)) + assert not np.isnan(np.sum(cols.to_numpy())) + assert not np.isnan(np.sum(rows.to_numpy())) From 613042ea83459b67d6126a5bf342772553d0c8b8 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 4 Oct 2022 09:47:56 -0400 Subject: [PATCH 27/45] Handle optional to_numpy --- tests/test_implicit.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_implicit.py b/tests/test_implicit.py index bd18dc15..770e82b5 100644 --- a/tests/test_implicit.py +++ b/tests/test_implicit.py @@ -28,5 +28,11 @@ def test_model(self): model.fit(counts, show_progress=False) rows, cols = model.item_factors, model.user_factors - assert not np.isnan(np.sum(cols.to_numpy())) - assert not np.isnan(np.sum(rows.to_numpy())) + assert not np.isnan(np.sum(tonumpy(cols)) + assert not np.isnan(np.sum(tonumpy(rows)) + + +def tonumpy(x): + if hasattr(x, 'to_numpy'): + return x.to_numpy() + return x From fbf984e08741bc6aa1fefadbf786878e473ee9bc Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 4 Oct 2022 11:08:34 -0400 Subject: [PATCH 28/45] fix brackets --- tests/test_implicit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_implicit.py b/tests/test_implicit.py index 770e82b5..9ab5bbf4 100644 --- a/tests/test_implicit.py +++ b/tests/test_implicit.py @@ -28,8 +28,8 @@ def test_model(self): model.fit(counts, show_progress=False) rows, cols = model.item_factors, model.user_factors - assert not np.isnan(np.sum(tonumpy(cols)) - assert not np.isnan(np.sum(tonumpy(rows)) + assert not np.isnan(np.sum(tonumpy(cols))) + assert not np.isnan(np.sum(tonumpy(rows))) def tonumpy(x): From 29abf1aa22e513e6e6efdd437505736b94ac6f53 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 7 Nov 2022 16:24:45 -0500 Subject: [PATCH 29/45] Make torchtext & torchaudio load libomp http://b/255757999 --- Dockerfile.tmpl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index eb05d878..3c8c3536 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -90,7 +90,7 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ conda install -c conda-forge mamba && \ - mamba install cudf cuml cudatoolkit && \ + mamba install -y cudf cuml cudatoolkit && \ /tmp/clean-layer.sh {{ end }} @@ -108,6 +108,13 @@ RUN conda install implicit && \ COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ pip install /tmp/torch/*.whl && \ + # b/255757999 openmp (libomp.so) is an dependency of libtorchtext and libtorchaudio but + # the built from source versions don't seem to properly link it in. This forces the dep + # which makes sure that libomp is loaded when these libraries are loaded. + conda install -y openmp && \ + pip install patchelf && \ + patchelf --add-needed libomp.so.5 /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \ + patchelf --add-needed libomp.so.5 /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \ rm -rf /tmp/torch && \ /tmp/clean-layer.sh {{ else }} From 8f31a858d5f1596f323261d7c62eef418d81dba2 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 7 Nov 2022 20:43:36 -0500 Subject: [PATCH 30/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 3c8c3536..0422db96 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -113,8 +113,8 @@ RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION # which makes sure that libomp is loaded when these libraries are loaded. conda install -y openmp && \ pip install patchelf && \ - patchelf --add-needed libomp.so.5 /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \ - patchelf --add-needed libomp.so.5 /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \ + patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \ + patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \ rm -rf /tmp/torch && \ /tmp/clean-layer.sh {{ else }} From 87d59c98bb7d7d7afc0f405a0e82873dac356ba4 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 9 Nov 2022 15:08:36 -0500 Subject: [PATCH 31/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 0422db96..f7991ef5 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -90,7 +90,7 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ conda install -c conda-forge mamba && \ - mamba install -y cudf cuml cudatoolkit && \ + mamba install -y cudf cuml cudatoolkit==11.2.2 && \ /tmp/clean-layer.sh {{ end }} From abaa7f0abc6ad127fffe1d573c278811c894c0ce Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 21 Nov 2022 16:24:58 -0500 Subject: [PATCH 32/45] force cudatoolkit 11.2.2 --- Dockerfile.tmpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index f7991ef5..00dcea92 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -90,7 +90,9 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ conda install -c conda-forge mamba && \ - mamba install -y cudf cuml cudatoolkit==11.2.2 && \ + mamba install -y cudf cuml && \ + # b/255757999: Force switch the cudatoolkit version back to the existing installation so it is compatible with pytorch + conda remove -y --force cudatoolkit=11.2.72 && mamba install --channel https://conda.anaconda.org/conda-forge/linux-64 cudatoolkit=11.2.2 && \ /tmp/clean-layer.sh {{ end }} From 962daca54d33f7402bbff5a5647fa00ebbc1ecc0 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 22 Nov 2022 09:56:00 -0500 Subject: [PATCH 33/45] Use mamba for all conda installs --- Dockerfile.tmpl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 00dcea92..fdf56c9b 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -80,16 +80,16 @@ ENV PROJ_LIB=/opt/conda/share/proj # the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ RUN conda config --add channels nvidia && \ conda config --add channels rapidsai && \ + conda install -c conda-forge mamba && \ # Base image channel order: conda-forge (highest priority), defaults. # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. - conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ + mamba install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ /tmp/clean-layer.sh {{ if eq .Accelerator "gpu" }} # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ - conda install -c conda-forge mamba && \ mamba install -y cudf cuml && \ # b/255757999: Force switch the cudatoolkit version back to the existing installation so it is compatible with pytorch conda remove -y --force cudatoolkit=11.2.72 && mamba install --channel https://conda.anaconda.org/conda-forge/linux-64 cudatoolkit=11.2.2 && \ @@ -98,22 +98,22 @@ RUN pip uninstall -y pyarrow && \ # Install implicit {{ if eq .Accelerator "gpu" }} -RUN conda install implicit implicit-proc=*=gpu && \ +RUN mamba install implicit implicit-proc=*=gpu && \ /tmp/clean-layer.sh {{ else }} -RUN conda install implicit && \ +RUN mamba install implicit && \ /tmp/clean-layer.sh {{ end}} # Install PyTorch {{ if eq .Accelerator "gpu" }} COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ +RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ pip install /tmp/torch/*.whl && \ # b/255757999 openmp (libomp.so) is an dependency of libtorchtext and libtorchaudio but # the built from source versions don't seem to properly link it in. This forces the dep # which makes sure that libomp is loaded when these libraries are loaded. - conda install -y openmp && \ + mamba install -y openmp && \ pip install patchelf && \ patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \ patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \ From 6732bc31bb8a62037fca9079b167588f652e77d5 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 22 Nov 2022 09:58:03 -0500 Subject: [PATCH 34/45] Undo force cudatoolkit, causes inconsistent env --- Dockerfile.tmpl | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index fdf56c9b..08b517e1 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -91,8 +91,6 @@ RUN conda config --add channels nvidia && \ # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. RUN pip uninstall -y pyarrow && \ mamba install -y cudf cuml && \ - # b/255757999: Force switch the cudatoolkit version back to the existing installation so it is compatible with pytorch - conda remove -y --force cudatoolkit=11.2.72 && mamba install --channel https://conda.anaconda.org/conda-forge/linux-64 cudatoolkit=11.2.2 && \ /tmp/clean-layer.sh {{ end }} From 2ec8ae929a7cc166455621de3f4dcd7f3ca21086 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 22 Nov 2022 10:09:53 -0500 Subject: [PATCH 35/45] Use mamba & include cuda upgrades in build --- packages/torch.Dockerfile | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index 82eca376..a90ffd3a 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -12,9 +12,17 @@ ARG CUDA_MINOR_VERSION # TORCHVISION_VERSION is mandatory RUN test -n "$TORCHVISION_VERSION" +# Use mamba to speed up conda installs +RUN conda install -c conda-forge mamba + +# Install cudf/cuml so that cudatoolkit upgrades are included in the pytorch build +RUN conda config --add channels nvidia && \ + conda config --add channels rapidsai +RUN mamba install -y cudf cuml + # Build instructions: https://github.com/pytorch/pytorch#from-source -RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses -RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} +RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses +RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. # This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000 @@ -46,7 +54,7 @@ RUN sudo apt-get update && \ # ncurses.h is required for this install sudo apt-get install libncurses-dev && \ # Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685 - conda install -c conda-forge ncurses && \ + mamba install -c conda-forge ncurses && \ cd /usr/local/src && \ git clone https://github.com/pytorch/audio && \ cd audio && \ From 0910998fc9696d4ab0f1e8e0989b6cd33523d541 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 30 Jan 2023 12:28:11 -0500 Subject: [PATCH 36/45] Disable rapidsai until compatible with tf cudatoolkit --- Dockerfile.tmpl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 08b517e1..f1aeec4d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -89,10 +89,11 @@ RUN conda config --add channels nvidia && \ {{ if eq .Accelerator "gpu" }} # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. -RUN pip uninstall -y pyarrow && \ - mamba install -y cudf cuml && \ - /tmp/clean-layer.sh -{{ end }} +# b/267180053: RapidsAI (cudf/cuml) are not compatible with the latest tensorflow cudatoolkit version. +# RUN pip uninstall -y pyarrow && \ +# mamba install -y cudf cuml && \ +# /tmp/clean-layer.sh +# {{ end }} # Install implicit {{ if eq .Accelerator "gpu" }} From 902a5fff15ed45c913c125e1167148c59e7489cc Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 30 Jan 2023 13:43:49 -0500 Subject: [PATCH 37/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 549ad46e..e23d4e94 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -195,7 +195,7 @@ RUN pip install pysal \ # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. apt-get install -y default-jre-headless && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o \ - tensorflow-gcs-config==<=${TENSORFLOW_VERSION}" \ + tensorflow-gcs-config<=${TENSORFLOW_VERSION} \ tensorflow-addons==0.17.1 \ tensorflow_decision_forests==0.2.7 && \ /tmp/clean-layer.sh From f9a5136e75b71b0c65cc74e3d01994e4c55cadca Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 30 Jan 2023 14:40:00 -0500 Subject: [PATCH 38/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index e23d4e94..c1c02bd7 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -195,7 +195,7 @@ RUN pip install pysal \ # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. apt-get install -y default-jre-headless && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o \ - tensorflow-gcs-config<=${TENSORFLOW_VERSION} \ + "tensorflow-gcs-config<=${TENSORFLOW_VERSION}" \ tensorflow-addons==0.17.1 \ tensorflow_decision_forests==0.2.7 && \ /tmp/clean-layer.sh From 537bd6ec7d0b2022f59e9abfa4ce350f40a8b758 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 30 Jan 2023 16:42:46 -0500 Subject: [PATCH 39/45] Update config.txt --- config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.txt b/config.txt index fab6002b..c8ca1257 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m96 +BASE_IMAGE_TAG=m103 CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 From f52379a1f2c73738c211ea5190293dce4d62735b Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 31 Jan 2023 10:14:31 -0500 Subject: [PATCH 40/45] Fix gitextensions bug --- packages/torch.Dockerfile | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index a90ffd3a..635d8f9e 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -15,11 +15,6 @@ RUN test -n "$TORCHVISION_VERSION" # Use mamba to speed up conda installs RUN conda install -c conda-forge mamba -# Install cudf/cuml so that cudatoolkit upgrades are included in the pytorch build -RUN conda config --add channels nvidia && \ - conda config --add channels rapidsai -RUN mamba install -y cudf cuml - # Build instructions: https://github.com/pytorch/pytorch#from-source RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} @@ -40,7 +35,7 @@ RUN cd /usr/local/src && \ cd pytorch && \ git checkout tags/v$PACKAGE_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Install torch which is required before we can build other torch* packages. @@ -60,7 +55,7 @@ RUN sudo apt-get update && \ cd audio && \ git checkout tags/v$TORCHAUDIO_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Build torchtext @@ -72,7 +67,7 @@ RUN cd /usr/local/src && \ cd text && \ git checkout tags/v$TORCHTEXT_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Build torchvision. From d527f3e885ba25adee009bf405b6c0fc492a8d82 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Mon, 6 Feb 2023 18:59:37 +0000 Subject: [PATCH 41/45] try 2.11 --- Dockerfile.tmpl | 2 +- config.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index c1c02bd7..e3bbdf81 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -34,7 +34,7 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib {{ end }} # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.9.2 +ENV TENSORFLOW_VERSION=2.11.0 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact diff --git a/config.txt b/config.txt index c8ca1257..4fb11971 100644 --- a/config.txt +++ b/config.txt @@ -1,7 +1,7 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release BASE_IMAGE_TAG=m103 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 +CPU_BASE_IMAGE_NAME=tf2-cpu.2-11 +GPU_BASE_IMAGE_NAME=tf2-gpu.2-11 LIGHTGBM_VERSION=3.3.2 TORCH_VERSION=1.12.0 TORCHAUDIO_VERSION=0.12.0 From 3aee97700259732ed0f921fde047fedc0dde2acc Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 7 Feb 2023 12:58:39 -0500 Subject: [PATCH 42/45] Update config.txt --- config.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.txt b/config.txt index 4fb11971..fab6002b 100644 --- a/config.txt +++ b/config.txt @@ -1,7 +1,7 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m103 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-11 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-11 +BASE_IMAGE_TAG=m96 +CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 +GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 TORCH_VERSION=1.12.0 TORCHAUDIO_VERSION=0.12.0 From 598205338884b322289a6e1eeffa54942f093dc3 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 7 Feb 2023 12:59:23 -0500 Subject: [PATCH 43/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index e3bbdf81..c1c02bd7 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -34,7 +34,7 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib {{ end }} # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.11.0 +ENV TENSORFLOW_VERSION=2.9.2 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact From c654412d44af23bd119904f43a6e730c6ef03b08 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 8 Feb 2023 14:00:55 -0500 Subject: [PATCH 44/45] Update Dockerfile.tmpl --- Dockerfile.tmpl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index c1c02bd7..82ff1cff 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -623,6 +623,9 @@ RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ jupyter-serverextension disable nb_conda --py --sys-prefix && \ python -m nb_conda_kernels.install --disable +# Force only one libcusolver +RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 + # Set backend for matplotlib ENV MPLBACKEND "agg" From 8f810aff0ffa0ad9e2fe8f7fcb4fb9856ed5b1c9 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Wed, 8 Feb 2023 20:26:50 +0000 Subject: [PATCH 45/45] remove test_repids.py --- tests/test_rapids.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 tests/test_rapids.py diff --git a/tests/test_rapids.py b/tests/test_rapids.py deleted file mode 100644 index 3b7239dd..00000000 --- a/tests/test_rapids.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -from common import gpu_test - - -class TestRapids(unittest.TestCase): - @gpu_test - def test_dbscan(self): - import cudf - from cuml.cluster import DBSCAN - - # Create and populate a GPU DataFrame - gdf_float = cudf.DataFrame() - gdf_float['0'] = [1.0, 2.0, 5.0] - gdf_float['1'] = [4.0, 2.0, 1.0] - gdf_float['2'] = [4.0, 2.0, 1.0] - - # Setup and fit clusters - dbscan_float = DBSCAN(eps=1.0, min_samples=1) - dbscan_float.fit(gdf_float) - - self.assertEqual(3, dbscan_float.labels_.size)