diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index c6c7a61d..82ff1cff 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -12,6 +12,15 @@ ARG TORCHVISION_VERSION FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} +{{ else }} +FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} +{{ end }} + +# Ensures shared libraries installed with conda can be found by the dynamic link loader. +ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" + +{{ if eq .Accelerator "gpu" }} ARG CUDA_MAJOR_VERSION ARG CUDA_MINOR_VERSION ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} @@ -22,11 +31,10 @@ ENV PATH=/opt/bin:${PATH} ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 -{{ else }} -FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} + # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.6.4 +ENV TENSORFLOW_VERSION=2.9.2 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact @@ -76,33 +84,42 @@ ENV PROJ_LIB=/opt/conda/share/proj # the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ RUN conda config --add channels nvidia && \ conda config --add channels rapidsai && \ + conda install -c conda-forge mamba && \ # Base image channel order: conda-forge (highest priority), defaults. # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. 
- conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ + mamba install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ /tmp/clean-layer.sh {{ if eq .Accelerator "gpu" }} # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. -RUN pip uninstall -y pyarrow && \ - conda install cudf=21.10 cuml=21.10 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh -{{ end }} +# b/267180053: RapidsAI (cudf/cuml) are not compatible with the latest tensorflow cudatoolkit version. +# RUN pip uninstall -y pyarrow && \ +# mamba install -y cudf cuml && \ +# /tmp/clean-layer.sh +# {{ end }} # Install implicit {{ if eq .Accelerator "gpu" }} -RUN conda install implicit implicit-proc=*=gpu && \ +RUN mamba install implicit implicit-proc=*=gpu && \ /tmp/clean-layer.sh {{ else }} -RUN conda install implicit && \ +RUN mamba install implicit && \ /tmp/clean-layer.sh {{ end}} # Install PyTorch {{ if eq .Accelerator "gpu" }} COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ +RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ pip install /tmp/torch/*.whl && \ + # b/255757999 openmp (libomp.so) is a dependency of libtorchtext and libtorchaudio but + # the built from source versions don't seem to properly link it in. This forces the dep + # which makes sure that libomp is loaded when these libraries are loaded. 
+ mamba install -y openmp && \ + pip install patchelf && \ + patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \ + patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \ rm -rf /tmp/torch && \ /tmp/clean-layer.sh {{ else }} @@ -141,7 +158,8 @@ RUN pip install jax[cpu] && \ # Install mxnet {{ if eq .Accelerator "gpu" }} -RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ +# No specific package for 11.3 minor versions, using 11.2 instead. +RUN pip install mxnet-cu112 && \ /tmp/clean-layer.sh {{ else }} RUN pip install mxnet && \ @@ -160,10 +178,11 @@ RUN pip install spacy && \ # Install GPU specific packages {{ if eq .Accelerator "gpu" }} # Install GPU-only packages +# No specific package for nnabla-ext-cuda 11.x minor versions. RUN pip install pycuda \ pynvrtc \ pynvml \ - nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ + nnabla-ext-cuda${CUDA_MAJOR_VERSION}0 && \ /tmp/clean-layer.sh {{ end }} @@ -176,9 +195,9 @@ RUN pip install pysal \ # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. 
apt-get install -y default-jre-headless && \ pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o \ - tensorflow-gcs-config==2.6.0 \ - tensorflow-addons==0.14.0 \ - tensorflow_decision_forests==0.2.0 && \ + "tensorflow-gcs-config<=${TENSORFLOW_VERSION}" \ + tensorflow-addons==0.17.1 \ + tensorflow_decision_forests==0.2.7 && \ /tmp/clean-layer.sh RUN apt-get install -y libfreetype6-dev && \ @@ -393,6 +412,8 @@ RUN pip install cython \ mlcrate && \ /tmp/clean-layer.sh + +# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 RUN pip install bleach \ certifi \ cycler \ @@ -402,7 +423,7 @@ RUN pip install bleach \ ipykernel \ ipython \ ipython-genutils \ - ipywidgets \ + ipywidgets==7.7.1 \ isoweek \ jedi \ jsonschema \ @@ -459,6 +480,10 @@ RUN pip install bleach \ # ########### +# dlib has a libmkl incompatibility: +# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. +# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. +# nnabla breaks protobuf compatibility: RUN pip install flashtext \ wandb \ # b/214080882 blake3 0.3.0 is not compatible with vaex. 
@@ -505,10 +530,8 @@ RUN pip install flashtext \ transformers \ # b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.* datasets==2.1.0 \ - dlib \ kaggle-environments \ geopandas \ - nnabla \ vowpalwabbit \ pydub \ pydegensac \ @@ -600,6 +623,9 @@ RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ jupyter-serverextension disable nb_conda --py --sys-prefix && \ python -m nb_conda_kernels.install --disable +# Force only one libcusolver +RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 + # Set backend for matplotlib ENV MPLBACKEND "agg" diff --git a/Jenkinsfile b/Jenkinsfile index 895664ab..92b1b85d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,7 +106,7 @@ pipeline { stages { stage('Build GPU Image') { options { - timeout(time: 180, unit: 'MINUTES') + timeout(time: 4324, unit: 'MINUTES') } steps { sh '''#!/bin/bash diff --git a/config.txt b/config.txt index 226e6f9e..fab6002b 100644 --- a/config.txt +++ b/config.txt @@ -1,12 +1,11 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m94 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-6 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-6 +BASE_IMAGE_TAG=m96 +CPU_BASE_IMAGE_NAME=tf2-cpu.2-9 +GPU_BASE_IMAGE_NAME=tf2-gpu.2-9 LIGHTGBM_VERSION=3.3.2 -TORCH_VERSION=1.11.0 -# TODO(b/215031404#comment4) Remove zlib sed command after upgrade to >= 0.11.1 -TORCHAUDIO_VERSION=0.11.0 -TORCHTEXT_VERSION=0.12.0 -TORCHVISION_VERSION=0.12.0 +TORCH_VERSION=1.12.0 +TORCHAUDIO_VERSION=0.12.0 +TORCHTEXT_VERSION=0.13.0 +TORCHVISION_VERSION=0.13.0 CUDA_MAJOR_VERSION=11 -CUDA_MINOR_VERSION=0 +CUDA_MINOR_VERSION=3 diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index 0843082b..635d8f9e 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -12,15 +12,22 @@ ARG CUDA_MINOR_VERSION # TORCHVISION_VERSION is mandatory RUN test -n "$TORCHVISION_VERSION" +# Use mamba to speed up 
conda installs +RUN conda install -c conda-forge mamba + # Build instructions: https://github.com/pytorch/pytorch#from-source -RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools==59.5.0 cmake cffi typing_extensions future six requests dataclasses -RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} +RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses +RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. # This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000 ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION ENV PYTORCH_BUILD_NUMBER=1 +# Ensures shared libraries installed with conda can be found by the dynamic link loader. +# For PyTorch, we need specifically mkl. +ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX" ENV FORCE_CUDA=1 RUN cd /usr/local/src && \ @@ -28,7 +35,7 @@ RUN cd /usr/local/src && \ cd pytorch && \ git checkout tags/v$PACKAGE_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Install torch which is required before we can build other torch* packages. @@ -38,14 +45,17 @@ RUN pip install /usr/local/src/pytorch/dist/*.whl # Instructions: https://github.com/pytorch/audio#from-source # See comment above for PYTORCH_BUILD_VERSION. 
ENV BUILD_VERSION=$TORCHAUDIO_VERSION -RUN cd /usr/local/src && \ +RUN sudo apt-get update && \ + # ncurses.h is required for this install + sudo apt-get install libncurses-dev && \ + # Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685 + mamba install -c conda-forge ncurses && \ + cd /usr/local/src && \ git clone https://github.com/pytorch/audio && \ cd audio && \ git checkout tags/v$TORCHAUDIO_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ - # TODO(b/215031404#comment4) Remove after upgrade next release (0.11.1) - sed -i s?https://zlib.net/zlib-1.2.11.tar.gz?https://sourceforge.net/projects/libpng/files/zlib/1.2.11/zlib-1.2.11.tar.gz? third_party/zlib/CMakeLists.txt && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Build torchtext @@ -57,7 +67,7 @@ RUN cd /usr/local/src && \ cd text && \ git checkout tags/v$TORCHTEXT_VERSION && \ git submodule sync && \ - git submodule update --init --recursive --jobs 0 && \ + git submodule update --init --recursive --jobs 1 && \ python setup.py bdist_wheel # Build torchvision. @@ -81,4 +91,4 @@ COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl # Print out the built .whl file. -RUN ls -lh /tmp/whl/ \ No newline at end of file +RUN ls -lh /tmp/whl/ diff --git a/tests/test_allennlp.py b/tests/test_allennlp.py deleted file mode 100644 index 1bc80f2d..00000000 --- a/tests/test_allennlp.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from allennlp.data.tokenizers import SpacyTokenizer - - -class TestAllenNlp(unittest.TestCase): - # reference - # https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/tokenizers/word_tokenizer_test.py - def test_passes_through_correctly(self): - tokenizer = SpacyTokenizer() - sentence = "this (sentence) has 'crazy' \"punctuation\"." 
- tokens = [t.text for t in tokenizer.tokenize(sentence)] - expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"", - "punctuation", "\"", "."] - self.assertSequenceEqual(tokens, expected_tokens) diff --git a/tests/test_dlib.py b/tests/test_dlib.py deleted file mode 100644 index 9e8ff28d..00000000 --- a/tests/test_dlib.py +++ /dev/null @@ -1,14 +0,0 @@ -import unittest - -import cv2 -import dlib - - -class TestDLib(unittest.TestCase): - def test_dlib_face_detector(self): - detector = dlib.get_frontal_face_detector() - image = cv2.imread('/input/tests/data/face.jpg') - image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - faces = detector(image_gray, 1) - - self.assertEqual(len(faces), 1) diff --git a/tests/test_implicit.py b/tests/test_implicit.py index 83eed4bb..9ab5bbf4 100644 --- a/tests/test_implicit.py +++ b/tests/test_implicit.py @@ -28,5 +28,11 @@ def test_model(self): model.fit(counts, show_progress=False) rows, cols = model.item_factors, model.user_factors - assert not np.isnan(np.sum(cols)) - assert not np.isnan(np.sum(rows)) + assert not np.isnan(np.sum(tonumpy(cols))) + assert not np.isnan(np.sum(tonumpy(rows))) + + +def tonumpy(x): + if hasattr(x, 'to_numpy'): + return x.to_numpy() + return x diff --git a/tests/test_nnabla.py b/tests/test_nnabla.py deleted file mode 100644 index ffb37fa4..00000000 --- a/tests/test_nnabla.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest - -import numpy as np -import nnabla as nn -import nnabla.functions as F -from nnabla.ext_utils import get_extension_context - -from common import gpu_test - - -class TestNNabla(unittest.TestCase): - def test_addition(self): - # entry variables - a = nn.Variable.from_numpy_array(np.random.random()) - b = nn.Variable.from_numpy_array(np.random.random()) - - # add operation - c = a + b - - # forward - c.forward() - - self.assertAlmostEqual(c.d, a.d + b.d, places=3) - - @gpu_test - def test_cuda_ext(self): - ctx = get_extension_context('cudnn', device_id='0') - 
nn.set_default_context(ctx) diff --git a/tests/test_rapids.py b/tests/test_rapids.py deleted file mode 100644 index 3b7239dd..00000000 --- a/tests/test_rapids.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -from common import gpu_test - - -class TestRapids(unittest.TestCase): - @gpu_test - def test_dbscan(self): - import cudf - from cuml.cluster import DBSCAN - - # Create and populate a GPU DataFrame - gdf_float = cudf.DataFrame() - gdf_float['0'] = [1.0, 2.0, 5.0] - gdf_float['1'] = [4.0, 2.0, 1.0] - gdf_float['2'] = [4.0, 2.0, 1.0] - - # Setup and fit clusters - dbscan_float = DBSCAN(eps=1.0, min_samples=1) - dbscan_float.fit(gdf_float) - - self.assertEqual(3, dbscan_float.labels_.size)