diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 11b05ebd..9d661201 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,70 +1,46 @@ -ARG BASE_IMAGE_REPO \ - BASE_IMAGE_TAG \ - CPU_BASE_IMAGE_NAME \ - GPU_BASE_IMAGE_NAME \ - LIGHTGBM_VERSION \ - TORCH_VERSION \ - TORCHAUDIO_VERSION \ - TORCHVISION_VERSION \ - JAX_VERSION - {{ if eq .Accelerator "gpu" }} -FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl -FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl -FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl -FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} +FROM us-docker.pkg.dev/colab-images/public/runtime:release-colab-external_20260126-060048_RC00 {{ else }} -FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} -{{ end }} +FROM us-docker.pkg.dev/colab-images/public/cpu-runtime:release-colab-external_20260123-060023_RC00 +{{ end}} -# Ensures shared libraries installed with conda can be found by the dynamic link loader. -ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" \ - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +ADD kaggle_requirements.txt /kaggle_requirements.txt -{{ if eq .Accelerator "gpu" }} -ARG CUDA_MAJOR_VERSION \ - CUDA_MINOR_VERSION -ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \ - CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} -# Make sure we are on the right version of CUDA -RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -# NVIDIA binaries from the host are mounted to /opt/bin. -ENV PATH=/opt/bin:${PATH} \ - # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine. 
- LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" \ - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" -RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 -{{ end }} +# Freeze existing requirements from base image for critical packages: +RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt + +# Merge requirements files: +RUN cat /colab_requirements.txt >> /requirements.txt +RUN cat /kaggle_requirements.txt >> /requirements.txt -# Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.16.1 \ - # See https://github.com/tensorflow/io#tensorflow-version-compatibility - TENSORFLOW_IO_VERSION=0.37.0 +# Install Kaggle packages +RUN uv pip install --system --no-cache -r /requirements.txt -# We need to redefine the ARG here to get the ARG value defined above the FROM instruction. -# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG LIGHTGBM_VERSION \ - TORCH_VERSION \ - TORCHAUDIO_VERSION \ - TORCHVISION_VERSION \ - JAX_VERSION +# Install manual packages: +# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. +RUN uv pip uninstall --system --no-cache google-cloud-bigquery-storage -# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 -# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information -ENV KMP_WARNINGS=0 \ - # Also make the KMP logs noverbose. - # https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message - KMP_SETTINGS=false \ - # Remove the pip as the root user warning. - PIP_ROOT_USER_ACTION=ignore +# uv cannot install this in requirements.txt without --no-build-isolation +# to avoid affecting the larger build, we'll post-install it. 
+RUN uv pip install --no-build-isolation --no-cache --system "git+https://github.com/Kaggle/learntools" +# b/404590350: Ray and torchtune have conflicting cli named `tune`. `ray` is not part of Colab's base image. Re-install `tune` to ensure the torchtune CLI is available by default. +# b/468367647: Unpin protobuf, version greater than v5.29.5 causes issues with numerous packages +RUN uv pip install --system --force-reinstall --no-cache --no-deps torchtune +RUN uv pip install --system --force-reinstall --no-cache "protobuf==5.29.5" + +# Adding non-package dependencies: ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json -# Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues -RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - -RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - +ARG PACKAGE_PATH=/usr/local/lib/python3.12/dist-packages + +# Install GPU-specific non-pip packages. +{{ if eq .Accelerator "gpu" }} +RUN uv pip install --system --no-cache "pycuda" +{{ end }} + # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 @@ -78,157 +54,18 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y graphviz && pip install graphviz && \ /tmp/clean-layer.sh -# b/128333086: Set PROJ_DATA to points to the proj4 cartographic library. 
-ENV PROJ_DATA=/opt/conda/share/proj - -# Install micromamba, setup channels, and replace conda with micromamba -ENV MAMBA_ROOT_PREFIX=/opt/conda -RUN curl -L "https://micro.mamba.pm/install.sh" -o /tmp/micromamba-install.sh \ - && bash /tmp/micromamba-install.sh \ - && rm /tmp/micromamba-install.sh \ - && mv ~/.local/bin/micromamba /usr/bin/micromamba \ - && (!(which conda) || cp /usr/bin/micromamba $(which conda)) \ - && micromamba config append channels nvidia \ - && micromamba config append channels rapidsai \ - && micromamba config append channels conda-forge \ - && micromamba config set channel_priority flexible \ - && python -m nb_conda_kernels.install --disable - -# Install conda packages not available on pip. -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -RUN micromamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \ - rm -rf /opt/conda/lib/python3.10/site-packages/pyproj/proj_dir/ && \ - /tmp/clean-layer.sh - -# Install spacy -# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. -# b/341938540: unistall grpc-cpp to allow >=v24.4 cudf and cuml to be installed. 
-{{ if eq .Accelerator "gpu" }} -RUN pip uninstall -y pyarrow && \ - micromamba install -vvvy spacy "cudf>=24.4" "cuml>=24.4" cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install spacy && \ - /tmp/clean-layer.sh -{{ end}} - -# Install PyTorch -# b/356397043: magma-cuda121 is the latest version -{{ if eq .Accelerator "gpu" }} -COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -# b/356397043: We are currently using cuda 12.3, -# but magma-cuda121 is the latest compatible version -RUN micromamba install -y -c pytorch magma-cuda121 && \ - pip install /tmp/torch/*.whl && \ - sudo apt -y install libsox-dev && \ - rm -rf /tmp/torch && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install \ - torch==$TORCH_VERSION+cpu \ - torchvision==$TORCHVISION_VERSION+cpu \ - torchaudio==$TORCHAUDIO_VERSION+cpu \ - --index-url https://download.pytorch.org/whl/cpu && \ - /tmp/clean-layer.sh -{{ end }} - -# Install LightGBM -{{ if eq .Accelerator "gpu" }} -COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ -# Install OpenCL (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - pip install /tmp/lightgbm/*.whl && \ - rm -rf /tmp/lightgbm && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install lightgbm==$LIGHTGBM_VERSION && \ - /tmp/clean-layer.sh -{{ end }} - -# Install JAX -{{ if eq .Accelerator "gpu" }} -COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/ -# b/319722433#comment9: Use pip wheels once versions matches our CUDA version. -RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install jax[cpu] && \ - /tmp/clean-layer.sh -{{ end }} - - -# Install GPU specific packages -{{ if eq .Accelerator "gpu" }} -# Install GPU-only packages -# No specific package for nnabla-ext-cuda 12.x minor versions. 
-RUN export PATH=/usr/local/cuda/bin:$PATH && \ - export CUDA_ROOT=/usr/local/cuda && \ - pip install pycuda \ - pynvrtc \ - pynvml && \ - /tmp/clean-layer.sh -{{ end }} - -# b/308525631: Pin Matplotlib until seaborn can be upgraded -# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1). -RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ - pip install --upgrade \ - "matplotlib<3.8.0" \ - # ipympl adds interactive widget support for matplotlib - ipympl==0.7.0 \ - "seaborn==0.12.2" \ - pyupset \ - python-dateutil dask dask-expr igraph \ - pyyaml joblib geopy mne pyshp \ - pandas \ - polars \ - flax \ - "${JAXVER}" && \ - /tmp/clean-layer.sh - -RUN apt-get update && \ - apt-get install -y default-jre && \ - /tmp/clean-layer.sh - -RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && /tmp/clean-layer.sh - -RUN pip install \ - "tensorflow==${TENSORFLOW_VERSION}" \ - "tensorflow-io==${TENSORFLOW_IO_VERSION}" \ - tensorflow-probability \ - tensorflow_decision_forests \ - tensorflow-text \ - "tensorflow_hub>=0.16.0" \ - tf-keras \ - "keras>3" \ - keras-cv \ - keras-nlp && \ - /tmp/clean-layer.sh - ADD patches/keras_internal.py \ patches/keras_internal_test.py \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/ + $PACKAGE_PATH/tensorflow_decision_forests/keras/ -# b/350573866: xgboost v2.1.0 breaks learntools RUN apt-get install -y libfreetype6-dev && \ apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ - rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ - pip install "numpy==1.26.4" && \ - pip install gensim \ - textblob \ - wordcloud \ - "xgboost==2.0.3" \ - pydot \ - hep_ml && \ - # NLTK Project datasets - mkdir -p /usr/share/nltk_data && \ + /tmp/clean-layer.sh + +RUN mkdir -p /usr/share/nltk_data && \ # NLTK Downloader no longer continues smoothly after an error, so we explicitly list # the corpuses that work - # "yes | ..." 
answers yes to the retry prompt in case of an error. See b/133762095. - yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ + python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ basque_grammars biocreative_ppi bllip_wsj_no_aux \ book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ @@ -237,282 +74,17 @@ RUN apt-get install -y libfreetype6-dev && \ masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ mte_teip5 names nps_chat omw opinion_lexicon paradigms \ pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ - pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ + pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ - vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - pip install scikit-image && \ - pip install opencv-contrib-python opencv-python && \ - /tmp/clean-layer.sh - -RUN pip install cython \ - fasttext \ - opencv-contrib-python \ - opencv-python \ - "scipy<1.14.0" \ - # Scikit-learn accelerated library for x86 - "scikit-learn-intelex>=2023.0.1" \ - # HDF5 support - h5py \ - # PUDB, for local debugging convenience - pudb \ - imbalanced-learn \ - # Profiling and other utilities - line_profiler \ - bokeh \ - numba \ - datashader \ - # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. 
Did you mean: 'geometry'" - "libpysal==4.9.2" \ - # b/276344496: Install specific version of boto3, because 1.26.103 is broken. - "boto3==1.26.100" \ - Boruta && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - /tmp/clean-layer.sh + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe RUN apt-get install -y git-lfs && \ - # vtk with dependencies + # vtk dependencies apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies + # xvfbwrapper dependencies apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN pip install mpld3 \ - gpxpy \ - arrow \ - nilearn \ - nibabel \ - imgaug \ - preprocessing \ - path.py && \ - pip install deap \ - # b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 - "tpot==0.12.1" \ - scikit-optimize \ - haversine \ - toolz cytoolz \ - plotly \ - hyperopt \ - langid \ - # Useful data exploration libraries (for missing data and generating reports) - missingno \ - pandas-profiling \ - bayesian-optimization \ - matplotlib-venn \ - pyldavis \ - mlxtend \ - altair \ - ImageHash \ - ecos \ - CVXcanon \ - pymc3 \ - tifffile \ - geojson \ - pydicom \ - wavio \ - SimpleITK \ - squarify \ - fuzzywuzzy \ - python-louvain \ - pyexcel-ods \ - sklearn-pandas \ - prophet \ - holidays \ - holoviews \ - scikit-multilearn \ - leven \ - catboost \ - folium \ - scikit-plot \ - fury dipy \ - plotnine \ - scikit-surprise \ - pymongo \ - eli5 \ - kaggle \ - kagglehub \ - google-generativeai \ - pytest && \ - /tmp/clean-layer.sh - - # Add google PAIR-code Facets -RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install librosa \ - sentencepiece \ - cufflinks \ - lime \ - memory_profiler && \ - /tmp/clean-layer.sh - -RUN pip install annoy \ - 
category_encoders && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. - pip uninstall -y google-cloud-bigquery-storage && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release \ - google-cloud-automl==1.0.1 \ - google-api-core==1.33.2 \ - google-cloud-bigquery \ - google-cloud-storage && \ - # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # b/315753846: Unpin translate package. - pip install google-cloud-translate==3.12.1 \ - google-cloud-language==2.* \ - google-cloud-videointelligence==2.* \ - google-cloud-vision==2.* \ - protobuf==3.20.3 \ - # Pandas data reader - pandas-datareader \ - emoji \ - # Add Japanese morphological analysis engine - janome \ - # yellowbrick machine learning visualization library - yellowbrick \ - mlcrate && \ - /tmp/clean-layer.sh - -# b/273059949: The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. 
-# b/274619697: learntools also requires a specific nbconvert right now -RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}* - -RUN pip install bleach \ - certifi \ - cycler \ - decorator \ - entrypoints \ - html5lib \ - ipykernel \ - ipython \ - ipython-genutils \ - # Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 - ipywidgets==7.7.1 \ - isoweek \ - jedi \ - jsonschema \ - jupyter-client \ - jupyter-console \ - jupyter-core \ - jupyterlab-lsp \ - MarkupSafe \ - mistune \ - nbformat \ - notebook \ - "nbconvert==6.4.5" \ - papermill \ - python-lsp-server[all] \ - olefile \ - kornia \ - pandas_summary \ - pandocfilters \ - pexpect \ - pickleshare \ - Pillow && \ - # Install openslide and its python binding - apt-get install -y openslide-tools && \ - pip install openslide-python \ - ptyprocess \ - Pygments \ - pyparsing \ - pytz \ - PyYAML \ - pyzmq \ - qtconsole \ - six \ - terminado \ - tornado \ - tqdm \ - traitlets \ - wcwidth \ - webencodings \ - widgetsnbextension \ - # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm - {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} - -RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ - apt-get update && apt-get install -y ffmpeg && \ - /tmp/clean-layer.sh - - ########### - # - # NEW CONTRIBUTORS: - # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end - # of all non-final lines. Thanks! - # - ########### - -RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json && \ - rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED -# dlib has a libmkl incompatibility: -# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. -# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. 
-# nnabla breaks protobuf compatibiilty: -RUN pip install wandb \ - pyemd \ - pympler \ - featuretools \ - #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ - git+https://github.com/Kaggle/learntools \ - ray \ - gym \ - pyarabic \ - pandasql \ - # b/302136621: Fix eli5 import for learntools - scikit-learn==1.2.2 \ - # b/329869023 shap 0.45.0 breaks learntools - shap==0.44.1 \ - cesium \ - rgf_python \ - jieba \ - tsfresh \ - optuna \ - plotly_express \ - albumentations \ - Rtree \ - accelerate && \ - apt-get -y install libspatialindex-dev && \ - # b/370860329: newer versions are not capable with current tensorflow - rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ - pip install "numpy==1.26.4" && \ - pip install pytorch-ignite \ - qgrid \ - bqplot \ - earthengine-api \ - transformers \ - datasets \ - s3fs \ - gcsfs \ - kaggle-environments \ - # geopandas > v0.14.4 breaks learn tools - geopandas==v0.14.4 \ - "shapely<2" \ - pydub \ - pydegensac \ - torchmetrics \ - pytorch-lightning \ - sympy \ - # flask is used by agents in the simulation competitions. - flask \ - # pycrypto is used by competitions team. - pycryptodome \ - nbdev \ - easyocr \ - onnx \ - tables \ - openpyxl \ - timm \ - torchinfo && \ - pip install git+https://github.com/facebookresearch/segment-anything.git && \ - # b/370860329: newer versions are not capable with current tensorflow - pip install --no-dependencies fastai fastdownload && \ - # b/343971718: remove duplicate aiohttp installs, and reinstall it - rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \ - micromamba install --force-reinstall -y aiohttp && \ /tmp/clean-layer.sh # Download base easyocr models. 
@@ -531,84 +103,47 @@ RUN mkdir -p /root/.EasyOCR/model && \ # Tesseract and some associated utility packages RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract \ - wand \ - pdf2image \ - PyPDF && \ /tmp/clean-layer.sh ENV TESSERACT_PATH=/usr/bin/tesseract \ - # For Facets - PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Facets, we also include an empty path to include $PWD. + PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \ # For Theano with MKL MKL_THREADING_LAYER=GNU # Temporary fixes and patches -# Temporary patch for Dask getting downgraded, which breaks Keras -RUN pip install --upgrade dask && \ - # Stop jupyter nbconvert trying to rewrite its folder hierarchy - mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ +# Stop jupyter nbconvert trying to rewrite its folder hierarchy +RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ - # Stop Matplotlib printing junk to the console on first load - sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.10/site-packages/matplotlib/font_manager.py && \ # Make matplotlib output in Jupyter notebooks display correctly mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ - # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. 
- ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) - pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \ /tmp/clean-layer.sh -# Fix to import bq_helper library without downgrading setuptools -RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \ - mkdir -p ~/src/BigQuery_Helper/bq_helper && \ - mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \ - mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \ - sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \ - pip install -e ~/src/BigQuery_Helper && \ +# install imagemagick for wand +# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu +RUN apt-get install libmagickwand-dev && \ /tmp/clean-layer.sh -# Add BigQuery client proxy settings -ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py \ - patches/kaggle_secrets.py \ - patches/kaggle_session.py \ - patches/kaggle_web_client.py \ - patches/kaggle_datasets.py \ - patches/log.py \ - patches/sitecustomize.py \ - /root/.local/lib/python3.10/site-packages/ - # Override default imagemagick policies ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml # Add Kaggle module resolver -ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/tensorflow_hub/kaggle_module_resolver.py -RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ - sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ - # Disable preloaded jupyter modules (they add 
to startup, and break when they are missing) - sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /bigquery/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /sql/d /etc/ipython/ipython_kernel_config.py +ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py +RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \ + sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py -# Force only one libcusolver -{{ if eq .Accelerator "gpu" }} -RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ else }} -RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ end }} - -# b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. -RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 && \ - # b/276358430: fix Jupyter lsp freezing up the jupyter server - pip install "jupyter-lsp==1.5.1" +# Add BigQuery client proxy settings +ENV PYTHONUSERBASE="/root/.local" +ADD patches/kaggle_gcp.py \ + patches/kaggle_secrets.py \ + patches/kaggle_session.py \ + patches/kaggle_web_client.py \ + patches/kaggle_datasets.py \ + $PACKAGE_PATH/ -# Set backend for matplotlib -ENV MPLBACKEND="agg" \ - # Set LC_ALL - # https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 - LC_ALL="POSIX" +# Figure out why this is in a different place? +# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it. 
+ADD patches/sitecustomize.py /usr/lib/python3.12/sitecustomize.py ARG GIT_COMMIT=unknown \ BUILD_DATE=unknown @@ -619,16 +154,11 @@ LABEL git-commit=$GIT_COMMIT \ ENV GIT_COMMIT=${GIT_COMMIT} \ BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION \ - # Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. - kaggle-lang=python - # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} -# Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \ - # Add the CUDA home. - CUDA_HOME=/usr/local/cuda +# Add the CUDA home. +ENV CUDA_HOME=/usr/local/cuda {{ end }} +ENTRYPOINT ["/usr/bin/env"] diff --git a/Jenkinsfile b/Jenkinsfile index 93f4753d..c4af03e6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,66 +21,6 @@ pipeline { } stages { - stage('Pre-build Packages from Source') { - parallel { - stage('torch') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package torch \ - --version $TORCH_VERSION \ - --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \ - --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - stage('lightgbm') { - options { - timeout(time: 10, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package lightgbm \ - --version $LIGHTGBM_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - 
stage('jaxlib') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package jaxlib \ - --version $JAX_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - } - } stage('Build/Test/Diff') { parallel { stage('CPU') { @@ -98,22 +38,6 @@ pipeline { ''' } } - stage('Test CPU Image') { - options { - timeout(time: 15, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } - } - } stage('Diff CPU image') { steps { sh '''#!/bin/bash @@ -150,44 +74,6 @@ pipeline { ''' } } - stage('Test GPU Image') { - stages { - stage('Test on P100') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 40, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - stage('Test on T4x2') { - agent { label 'ephemeral-linux-gpu-t4x2' } - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - } - } stage('Diff GPU Image') { steps { sh '''#!/bin/bash @@ -201,6 +87,7 @@ pipeline { } } stage('TPU VM') { + agent { label 'ephemeral-linux' } stages { stage('Build TPU VM Image') { options { @@ -230,6 +117,61 @@ pipeline { } } + stage('Test') { + parallel { + stage('Test CPU Image') { + options { + timeout(time: 15, unit: 'MINUTES') + } + steps { + 
retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on P100') { + agent { label 'ephemeral-linux-gpu' } + options { + timeout(time: 40, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on T4x2') { + agent { label 'ephemeral-linux-gpu-t4x2' } + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + } + } + stage('Label CPU/GPU Staging Images') { steps { sh '''#!/bin/bash @@ -245,13 +187,13 @@ pipeline { post { failure { - mattermostSend color: 'danger', message: "*<${env.BUILD_URL}console|${JOB_NAME} failed>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.MATTERMOST_CHANNEL + mattermostSend color: 'danger', message: "*<${env.BUILD_URL}console|${JOB_NAME} failed>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: env.MATTERMOST_CHANNEL } success { - mattermostSend color: 'good', message: "*<${env.BUILD_URL}console|${JOB_NAME} passed>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.MATTERMOST_CHANNEL + mattermostSend color: 'good', message: "*<${env.BUILD_URL}console|${JOB_NAME} passed>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: env.MATTERMOST_CHANNEL } aborted { - mattermostSend color: 'warning', message: "*<${env.BUILD_URL}console|${JOB_NAME} aborted>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.MATTERMOST_CHANNEL + mattermostSend color: 'warning', message: "*<${env.BUILD_URL}console|${JOB_NAME} aborted>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: 
env.MATTERMOST_CHANNEL } } } diff --git a/README.md b/README.md index 387dcf89..315e7db2 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ If you the first step above doesn't work for your use case, [open an issue](http ## Opening a pull request -1. Edit the [Dockerfile](Dockerfile.tmpl). +1. Edit [kaggle_requirements.txt](kaggle_requirements.txt). 1. Follow the instructions below to build a new image. 1. Add tests for your new package. See this [example](https://github.com/Kaggle/docker-python/blob/main/tests/test_fastai.py). 1. Follow the instructions below to test the new image. diff --git a/build b/build index 9b20f2dc..83bbe577 100755 --- a/build +++ b/build @@ -47,18 +47,13 @@ done BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)" BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')" -# Read build args from config.txt file. -SRCDIR=$(dirname "${BASH_SOURCE[0]}") -for l in `cat ${SRCDIR}/config.txt`; do - BUILD_ARGS+=" --build-arg $l" -done - readonly CACHE_FLAG readonly DOCKERFILE readonly ACCELERATOR readonly IMAGE_TAG readonly BUILD_ARGS +SRCDIR=$(dirname "${BASH_SOURCE[0]}") DOCKERFILE_OUTDIR="${SRCDIR}/.generated" mkdir -p $DOCKERFILE_OUTDIR DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE" diff --git a/clean-layer.sh b/clean-layer.sh index d1a048fc..9a50e7bf 100755 --- a/clean-layer.sh +++ b/clean-layer.sh @@ -10,8 +10,6 @@ set -e set -x -# Delete files that pip caches when installing a package. 
-rm -rf /root/.cache/pip/* # Delete old downloaded archive files apt-get autoremove -y # Delete downloaded archive files @@ -19,6 +17,4 @@ apt-get clean # Ensures the current working directory won't be deleted cd /usr/local/src/ # Delete source files used for building binaries -rm -rf /usr/local/src/* -# Delete conda downloaded tarballs -conda clean -y --tarballs +rm -rf /usr/local/src/* \ No newline at end of file diff --git a/config.txt b/config.txt deleted file mode 100644 index e95a1af1..00000000 --- a/config.txt +++ /dev/null @@ -1,11 +0,0 @@ -BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m122 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310 -LIGHTGBM_VERSION=4.2.0 -TORCH_VERSION=2.4.0 -TORCHAUDIO_VERSION=2.4.0 -TORCHVISION_VERSION=0.19.0 -JAX_VERSION=0.4.26 -CUDA_MAJOR_VERSION=12 -CUDA_MINOR_VERSION=3 diff --git a/diff b/diff index c0eb2e18..c8251703 100755 --- a/diff +++ b/diff @@ -104,7 +104,7 @@ fi for cmd in "${CMDS[@]}"; do echo "== Comparing $cmd ==" diff --suppress-common-lines --side-by-side \ - <(docker run -v $PWD/tools:/tools --rm "$BASE_IMAGE_TAG" /bin/bash -c "$cmd") \ - <(docker run -v $PWD/tools:/tools --rm "$TARGET_IMAGE_TAG" /bin/bash -c "$cmd") \ + <(docker run -v $PWD/tools:/tools --entrypoint bash --rm "$BASE_IMAGE_TAG" -c "$cmd") \ + <(docker run -v $PWD/tools:/tools --entrypoint bash --rm "$TARGET_IMAGE_TAG" -c "$cmd") \ && echo 'No diff' || true done diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt new file mode 100644 index 00000000..30e0683f --- /dev/null +++ b/kaggle_requirements.txt @@ -0,0 +1,128 @@ +# Please keep this in alphabetical order +Boruta +Cartopy +ImageHash +Janome +PyArabic +PyUpSet +Pympler +Rtree +shapely +SimpleITK +TPOT +Wand +bayesian-optimization +boto3 +catboost +category-encoders +cesium +comm +cytoolz +# Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" +# 
https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor +datasets>=2.14.6 +deap +dipy +docker +easyocr +emoji +fasttext +featuretools +fiona +fury +fuzzywuzzy +geojson +gensim +# b/443054743,b/455550872 +google-adk[a2a,eval]>=1.21.0 +google-cloud-aiplatform +google-cloud-videointelligence +google-cloud-vision +google-genai +gpxpy +h2o +haversine +hep-ml +igraph +ipympl +ipywidgets==8.1.5 +isoweek +jedi +# jitler 0.11.1 breaks simulation image +jiter==0.10.0 +# b/276358430: fix Jupyter lsp freezing up the jupyter server +jupyter-lsp==1.5.1 +# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) +jupyter_server==2.12.5 +jupyter_server_proxy +jupyterlab +jupyterlab-lsp +kaggle>=1.8.3 +kaggle-environments +kagglehub[pandas-datasets,hf-datasets,signing]>=0.4.2 +keras-cv +keras-nlp +keras-tuner +kornia +langid +libpysal +lime +line_profiler +mamba +matplotlib +mlcrate +mne +mpld3 +# b/274619697: learntools requires a specific nbconvert right now +nbconvert==6.4.5 +nbdev +nilearn +olefile +onnx +openslide-bin +openslide-python +optuna +pandas-profiling +pandasql +papermill +path +path.py +pdf2image +plotly-express +pudb +pyLDAvis +pycryptodome +pydicom +pyemd +pyexcel-ods +pymongo +pypdf +pytesseract +python-lsp-server +pytorch-ignite +pytorch-lightning +qgrid +qtconsole +ray +rgf-python +s3fs +scikit-learn +# Scikit-learn accelerated library for x86 +scikit-learn-intelex>=2023.0.1 +scikit-multilearn +scikit-optimize +scikit-plot +scikit-surprise +git+https://github.com/facebookresearch/segment-anything.git +squarify +tensorflow-io +# Must be compatible with torch version: https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec +torchcodec==0.9 +torchinfo +torchmetrics +torchtune +transformers>=5.0.0 +vtk +wavio +xvfbwrapper +ydata-profiling diff --git a/packages/README.md b/packages/README.md deleted file mode 100644 index 
e69de29b..00000000 diff --git a/packages/build_package b/packages/build_package deleted file mode 100755 index 1e6a7f94..00000000 --- a/packages/build_package +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash -set -e - -usage() { -cat << EOF -Usage: $0 [OPTIONS] -Build a new package ".whl". - -Options: - -p, --package PACKAGE Package to build (e.g. lightgbm). - -v, --version VERSION Package version to build. - -b, --base-image IMAGE Base image tag (e.g. m80). - -c, --use-cache Use layer cache when building a new image. - -f, --force-rebuild Rebuild the image regardless of whether it already exist on GCR. - -u, --push Push image to GCR. - --build-arg ARG=VALUE Build arguments to pass to the docker build command. -EOF -} - -PACKAGE='' -PACKAGE_VERSION='' -BASE_IMAGE='' -DOCKERFILE='' -CACHE_FLAG='--no-cache' -FORCE_REBUILD=false -PUSH_TO_GCR=false -BUILD_ARGS='' - -while :; do - case "$1" in - -h|--help) - usage - exit - ;; - -p|--package) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2 - exit 1 - fi - PACKAGE=$2 - DOCKERFILE="${PACKAGE}.Dockerfile" - shift # skip the flag value - ;; - -v|--version) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2 - exit 1 - fi - PACKAGE_VERSION=$2 - shift # skip the flag value - ;; - -t|--base-image) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No TAG specified after the %s flag.\n' "$1" >&2 - exit 1 - fi - BASE_IMAGE=$2 - shift # skip the flag value - ;; - -c|--use-cache) - CACHE_FLAG='' - ;; - -f|--force-rebuild) - FORCE_REBUILD=true - ;; - -u|--push) - PUSH_TO_GCR=true - ;; - --build-arg) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2 - exit 1 - fi - BUILD_ARGS+=" $1 $2" - shift # skip the flag value - ;; - -?*) - usage - printf 'ERROR: Unknown option: %s\n' "$1" >&2 - exit 1 - ;; - *) - break - esac - - shift -done - -readonly PACKAGE -readonly PACKAGE_VERSION -readonly 
BASE_IMAGE -readonly DOCKERFILE -readonly CACHE_FLAG -readonly FORCE_REBUILD - -SRCDIR=$(dirname "${BASH_SOURCE[0]}") -DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE" - -if [[ -z "$PACKAGE_VERSION" ]]; then - printf 'ERROR: missing --version flag.\n' - exit 1 -fi - -if [[ -z "$BASE_IMAGE" ]]; then - printf 'ERROR: missing --base-image flag.\n' - exit 1 -fi - -if [[ -z "$DOCKERFILE" ]]; then - printf 'ERROR: missing --package flag.\n' - exit 1 -fi - -# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` -TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//} -# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108` -TAG=${TAG/gcr.io\/kaggle-images\//} -# Replace the `:` in `tf2-gpu.2-6:m80` by `-` -TAG=${TAG/:/-} -# Append the package version -TAG=$TAG-$PACKAGE_VERSION -# Add the gcr repo. -TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG - -SHOULD_BUILD=true -if ! $FORCE_REBUILD; then - echo "Checking if $TAG exists..." - docker pull $TAG && SHOULD_BUILD=false -fi - -if $SHOULD_BUILD; then - echo "Building $TAG..." - docker build --rm --pull $BUILD_ARGS \ - $CACHE_FLAG \ - -t $TAG \ - -f "$DOCKERFILE_PATH" \ - --build-arg BASE_IMAGE=$BASE_IMAGE \ - --build-arg PACKAGE_VERSION=$PACKAGE_VERSION \ - $SRCDIR - - if $PUSH_TO_GCR; then - echo "Pushing $TAG to GCR..." - docker push $TAG - fi -else - echo "Skipping build. $TAG already exists." - echo "Use --force-rebuild if you want to build a new version anyway." 
-fi \ No newline at end of file diff --git a/packages/jaxlib.Dockerfile b/packages/jaxlib.Dockerfile deleted file mode 100644 index ed73991c..00000000 --- a/packages/jaxlib.Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -ARG BASE_IMAGE - -FROM ${BASE_IMAGE} AS builder - -ARG PACKAGE_VERSION -ARG CUDA_MAJOR_VERSION -ARG CUDA_MINOR_VERSION - -# Make sure we are on the right version of CUDA -RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION - -# Ensures shared libraries installed with conda can be found by the dynamic link loader. -# For PyTorch, we need specifically mkl. -ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" - -# Instructions: https://jax.readthedocs.io/en/latest/developer.html#building-jaxlib-from-source -RUN sudo ln -s /usr/bin/python3 /usr/bin/python - -RUN apt-get update && \ - apt-get install -y g++ python3 python3-dev - -RUN pip install numpy wheel build - -RUN cd /usr/local/src && \ - git clone https://github.com/google/jax && \ - cd jax && \ - git checkout jaxlib-v$PACKAGE_VERSION - -RUN cd /usr/local/src/jax && \ - python build/build.py --enable_cuda - -# Using multi-stage builds to ensure the output image is very small -# See: https://docs.docker.com/develop/develop-images/multistage-build/ -FROM alpine:latest - -RUN mkdir -p /tmp/whl/ -COPY --from=builder /usr/local/src/jax/dist/*.whl /tmp/whl - -# Print out the built .whl file. 
-RUN ls -lh /tmp/whl/ \ No newline at end of file diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile deleted file mode 100644 index 376eaaef..00000000 --- a/packages/lightgbm.Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -ARG BASE_IMAGE - -FROM ${BASE_IMAGE} AS builder - -ARG PACKAGE_VERSION -ARG CUDA_MAJOR_VERSION -ARG CUDA_MINOR_VERSION - -# Make sure we are on the right version of CUDA -RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION - -# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm -RUN apt-get update && \ - apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev clinfo nvidia-opencl-dev opencl-headers - -RUN cd /usr/local/src && \ - git clone --recursive https://github.com/microsoft/LightGBM && \ - cd LightGBM && \ - git checkout tags/v$PACKAGE_VERSION && \ - ./build-python.sh bdist_wheel --gpu --opencl-library=/usr/local/cuda/lib64/libOpenCL.so --opencl-include-dir=/usr/local/cuda/include/ - -# Using multi-stage builds to ensure the output image is very small -# See: https://docs.docker.com/develop/develop-images/multistage-build/ -FROM alpine:latest - -RUN mkdir -p /tmp/whl/ -COPY --from=builder /usr/local/src/LightGBM/dist/*.whl /tmp/whl - -# Print out the built .whl file. 
-RUN ls -lh /tmp/whl/ \ No newline at end of file diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile deleted file mode 100644 index 68c1eff3..00000000 --- a/packages/torch.Dockerfile +++ /dev/null @@ -1,86 +0,0 @@ -ARG BASE_IMAGE - -FROM ${BASE_IMAGE} AS builder - -ARG PACKAGE_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION -ARG CUDA_MAJOR_VERSION -ARG CUDA_MINOR_VERSION - -# Make sure we are on the right version of CUDA -RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION - -# TORCHVISION_VERSION is mandatory -RUN test -n "$TORCHVISION_VERSION" - -# Use mamba to speed up conda installs -RUN conda install -c conda-forge mamba - -# Build instructions: https://github.com/pytorch/pytorch#from-source -RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses -RUN mamba install -c pytorch magma-cuda121 - -# By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. -# This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000 -ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION -ENV PYTORCH_BUILD_NUMBER=1 - -# Ensures shared libraries installed with conda can be found by the dynamic link loader. -# For PyTorch, we need specifically mkl. -ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" -ENV TORCH_CUDA_ARCH_LIST="6.0;7.0+PTX;7.5+PTX" -ENV FORCE_CUDA=1 -RUN cd /usr/local/src && \ - git clone --recursive https://github.com/pytorch/pytorch && \ - cd pytorch && \ - git checkout tags/v$PACKAGE_VERSION && \ - git submodule sync && \ - git submodule update --init --recursive --jobs 1 && \ - python setup.py bdist_wheel - -# Install torch which is required before we can build other torch* packages. 
-RUN pip install /usr/local/src/pytorch/dist/*.whl - -# Build torchaudio -# Instructions: https://github.com/pytorch/audio#from-source -# See comment above for PYTORCH_BUILD_VERSION. -ENV BUILD_VERSION=$TORCHAUDIO_VERSION -RUN sudo apt-get update && \ - # ncurses.h is required for this install - sudo apt-get install libncurses-dev && \ - # Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685 - mamba install -c conda-forge ncurses && \ - cd /usr/local/src && \ - git clone https://github.com/pytorch/audio && \ - cd audio && \ - git checkout tags/v$TORCHAUDIO_VERSION && \ - git submodule sync && \ - git submodule update --init --recursive --jobs 1 -# https://github.com/pytorch/audio/issues/936#issuecomment-702990346 -RUN sed -i 's/set(envs/set(envs\n "LIBS=-ltinfo"/' /usr/local/src/audio/third_party/sox/CMakeLists.txt -RUN cd /usr/local/src/audio && python setup.py bdist_wheel - -# Build torchvision. -# Instructions: https://github.com/pytorch/vision/tree/main#installation -# See comment above for PYTORCH_BUILD_VERSION. -ENV CUDA_HOME=/usr/local/cuda -ENV BUILD_VERSION=$TORCHVISION_VERSION -RUN cd /usr/local/src && \ - git clone --recursive https://github.com/pytorch/vision && \ - cd vision && \ - git checkout tags/v$TORCHVISION_VERSION && \ - python setup.py bdist_wheel - -# Using multi-stage builds to ensure the output image is very small -# See: https://docs.docker.com/develop/develop-images/multistage-build/ -FROM alpine:latest - -RUN mkdir -p /tmp/whl/ -COPY --from=builder /usr/local/src/pytorch/dist/*.whl /tmp/whl -COPY --from=builder /usr/local/src/audio/dist/*.whl /tmp/whl -COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl - -# Print out the built .whl file. 
-RUN ls -lh /tmp/whl/ diff --git a/patches/kaggle_gcp.py b/patches/kaggle_gcp.py index 2c8b64cc..4cb98858 100644 --- a/patches/kaggle_gcp.py +++ b/patches/kaggle_gcp.py @@ -1,5 +1,6 @@ import os import inspect +import logging from google.auth import credentials, environment_vars from google.auth.exceptions import RefreshError from google.api_core.gapic_v1.client_info import ClientInfo @@ -8,8 +9,6 @@ from google.cloud.bigquery._http import Connection from kaggle_secrets import GcpTarget, UserSecretsClient -from log import Log - KAGGLE_GCP_CLIENT_USER_AGENT="kaggle-gcp-client/1.0" def get_integrations(): @@ -22,7 +21,7 @@ def get_integrations(): target = GcpTarget[integration.upper()] kernel_integrations.add_integration(target) except KeyError as e: - Log.error(f"Unknown integration target: {integration.upper()}") + logging.debug(f"Unknown integration target: {integration.upper()}") return kernel_integrations @@ -66,14 +65,14 @@ def refresh(self, request): elif self.target == GcpTarget.CLOUDAI: self.token, self.expiry = client._get_cloudai_access_token() except ConnectionError as e: - Log.error(f"Connection error trying to refresh access token: {e}") + logging.error(f"Connection error trying to refresh access token: {e}") print("There was a connection error trying to fetch the access token. 
" f"Please ensure internet is on in order to use the {self.target.service} Integration.") raise RefreshError('Unable to refresh access token due to connection error.') from e except Exception as e: - Log.error(f"Error trying to refresh access token: {e}") + logging.error(f"Error trying to refresh access token: {e}") if (not get_integrations().has_integration(self.target)): - Log.error(f"No {self.target.service} integration found.") + logging.error(f"No {self.target.service} integration found.") print( f"Please ensure you have selected a {self.target.service} account in the Notebook Add-ons menu.") raise RefreshError('Unable to refresh access token.') from e @@ -102,7 +101,7 @@ def api_request(self, *args, **kwargs): msg = ("Permission denied using Kaggle's public BigQuery integration. " "Did you mean to select a BigQuery account in the Notebook Add-ons menu?") print(msg) - Log.info(msg) + logging.info(msg) raise e @@ -156,23 +155,23 @@ def monkeypatch_bq(bq_client, *args, **kwargs): # Remove these two lines once this is resolved: # https://github.com/googleapis/google-cloud-python/issues/8108 if explicit_project_id: - Log.info(f"Explicit project set to {explicit_project_id}") + logging.info(f"Explicit project set to {explicit_project_id}") kwargs['project'] = explicit_project_id if explicit_project_id is None and specified_credentials is None and not has_bigquery: msg = "Using Kaggle's public dataset BigQuery integration." 
- Log.info(msg) + logging.info(msg) print(msg) return PublicBigqueryClient(*args, **kwargs) else: if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = KaggleKernelCredentials() if (not has_bigquery): - Log.info("No bigquery integration found, creating client anyways.") + logging.info("No bigquery integration found, creating client anyways.") print('Please ensure you have selected a BigQuery ' 'account in the Notebook Add-ons menu.') if explicit_project_id is None: - Log.info("No project specified while using the unmodified client.") + logging.info("No project specified while using the unmodified client.") print('Please ensure you specify a project id when creating the client' ' in order to use your BigQuery account.') kwargs['client_info'] = set_kaggle_user_agent(kwargs.get('client_info')) @@ -196,20 +195,20 @@ def monkeypatch_aiplatform_init(aiplatform_klass, kaggle_kernel_credentials): def patched_init(*args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = kaggle_kernel_credentials return aiplatform_init(*args, **kwargs) if (not has_been_monkeypatched(aiplatform_klass.init)): aiplatform_klass.init = patched_init - Log.info("aiplatform.init patched") + logging.info("aiplatform.init patched") def monkeypatch_client(client_klass, kaggle_kernel_credentials): client_init = client_klass.__init__ def patched_init(self, *args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") # Some GCP services demand the billing and 
target project must be the same. # To avoid using default service account based credential as caller credential # user need to provide ClientOptions with quota_project_id: @@ -227,7 +226,7 @@ def patched_init(self, *args, **kwargs): if (not has_been_monkeypatched(client_klass.__init__)): client_klass.__init__ = patched_init - Log.info(f"Client patched: {client_klass}") + logging.info(f"Client patched: {client_klass}") def set_kaggle_user_agent(client_info: ClientInfo): # Add kaggle client user agent in order to attribute usage. @@ -253,37 +252,6 @@ def init_gcs(): KaggleKernelCredentials(target=GcpTarget.GCS)) return storage -def init_automl(): - from google.cloud import automl, automl_v1beta1 - if not is_user_secrets_token_set(): - return - - from kaggle_gcp import get_integrations - if not get_integrations().has_cloudai(): - return - - from kaggle_secrets import GcpTarget - from kaggle_gcp import KaggleKernelCredentials - kaggle_kernel_credentials = KaggleKernelCredentials(target=GcpTarget.CLOUDAI) - - # Patch the 2 GA clients: AutoMlClient and PreditionServiceClient - monkeypatch_client(automl.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl.PredictionServiceClient, kaggle_kernel_credentials) - - # The AutoML client library exposes 3 different client classes (AutoMlClient, - # TablesClient, PredictionServiceClient), so patch each of them. - # The same KaggleKernelCredentials are passed to all of them. - # The GcsClient class is only used internally by TablesClient. - - # The beta version of the clients that are now GA are included here for now. - # They are deprecated and will be removed by 1 May 2020. - monkeypatch_client(automl_v1beta1.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl_v1beta1.PredictionServiceClient, kaggle_kernel_credentials) - - # The TablesClient is still in beta, so this will not be deprecated until - # the TablesClient is GA. 
- monkeypatch_client(automl_v1beta1.TablesClient, kaggle_kernel_credentials) - def init_translation_v2(): from google.cloud import translate_v2 if not is_user_secrets_token_set(): @@ -379,7 +347,6 @@ def init_vision(): def init(): init_bigquery() init_gcs() - init_automl() init_translation_v2() init_translation_v3() init_natural_language() @@ -392,4 +359,4 @@ def init(): # google.cloud.* and kaggle_gcp. By calling init here, we guarantee # that regardless of the original import that caused google.cloud.* to be # loaded, the monkeypatching will be done. -init() +init() \ No newline at end of file diff --git a/patches/log.py b/patches/log.py deleted file mode 100644 index 59a07c8c..00000000 --- a/patches/log.py +++ /dev/null @@ -1,133 +0,0 @@ -import io -import logging -import os - -import google.auth - - -_LOG_TO_FILE_ENV = os.getenv("KAGGLE_LOG_TO_FILE") - - -class _LogFormatter(logging.Formatter): - """A logging formatter which truncates long messages.""" - - _MAX_LOG_LENGTH = 10000 # Be generous, not to truncate long backtraces. - - def format(self, record): - msg = super(_LogFormatter, self).format(record) - return msg[:_LogFormatter._MAX_LOG_LENGTH] if msg else msg - -# TODO(vimota): Clean this up once we're using python 3.8 and can use -# (https://github.com/python/cpython/commit/dde9fdbe453925279ac3d2a6a72102f6f9ef247c) -# Right now, making the logging module display the intended frame's information -# when the logging calls (info, warn, ...) are wrapped (as is the case in our -# Log class) involves fragile logic. -class _Logger(logging.Logger): - - # This is a copy of logging.Logger.findCaller with the filename ignore - # set expanded to include the current filename (".../log.py"). - # Copyright 2001-2015 by Vinay Sajip. All Rights Reserved. 
- # License: https://github.com/python/cpython/blob/ce9e62544571e7ade7186697d5dd065fb4c5243f/LICENSE - def findCaller(self, stack_info=False, stacklevel=1): - f = logging.currentframe() - f = f.f_back - rv = "(unknown file)", 0, "(unknown function)", None - while hasattr(f, "f_code"): - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename in _ignore_srcfiles: - f = f.f_back - continue - sinfo = None - if stack_info: - sio = io.StringIO() - sio.write('Stack (most recent call last):\n') - traceback.print_stack(f, file=sio) - sinfo = sio.getvalue() - if sinfo[-1] == '\n': - sinfo = sinfo[:-1] - sio.close() - rv = (co.co_filename, f.f_lineno, co.co_name, sinfo) - break - return rv - - -_srcfile = os.path.normcase(_Logger.findCaller.__code__.co_filename) -_ignore_srcfiles = (_srcfile, logging._srcfile) - -class Log: - """ Helper aggregate for all things related to logging activity. """ - - _GLOBAL_LOG = logging.getLogger("") - _initialized = False - - # These are convenience helpers. 
For performance, consider saving Log.get_logger() and using that - @staticmethod - def critical(msg, *args, **kwargs): - Log._GLOBAL_LOG.critical(msg, *args, **kwargs) - - @staticmethod - def fatal(msg, *args, **kwargs): - Log._GLOBAL_LOG.fatal(msg, *args, **kwargs) - - @staticmethod - def exception(msg, *args, **kwargs): - Log._GLOBAL_LOG.exception(msg, *args, **kwargs) - - @staticmethod - def error(msg, *args, **kwargs): - Log._GLOBAL_LOG.error(msg, *args, **kwargs) - - @staticmethod - def warn(msg, *args, **kwargs): - Log._GLOBAL_LOG.warn(msg, *args, **kwargs) - - @staticmethod - def warning(msg, *args, **kwargs): - Log._GLOBAL_LOG.warning(msg, *args, **kwargs) - - @staticmethod - def debug(msg, *args, **kwargs): - Log._GLOBAL_LOG.debug(msg, *args, **kwargs) - - @staticmethod - def info(msg, *args, **kwargs): - Log._GLOBAL_LOG.info(msg, *args, **kwargs) - - @staticmethod - def set_level(loglevel): - if isinstance(loglevel, int): - Log._GLOBAL_LOG.setLevel(loglevel) - return - elif isinstance(loglevel, str): - # idea from https://docs.python.org/3.5/howto/logging.html#logging-to-a-file - numeric_level = getattr(logging, loglevel.upper(), None) - if isinstance(numeric_level, int): - Log._GLOBAL_LOG.setLevel(numeric_level) - return - - raise ValueError('Invalid log level: %s' % loglevel) - - @staticmethod - def _static_init(): - if Log._initialized: - return - - logging.setLoggerClass(_Logger) - # The root logger's type is unfortunately (and surprisingly) not affected by - # `setLoggerClass`. Monkey patch it instead. TODO(vimota): Remove this, see the TODO - # associated with _Logger. - logging.RootLogger.findCaller = _Logger.findCaller - log_to_file = _LOG_TO_FILE_ENV.lower() in ("yes", "true", "t", "1") if _LOG_TO_FILE_ENV is not None else True - if log_to_file: - handler = logging.FileHandler(filename='/tmp/kaggle.log', mode='w') - else: - handler = logging.StreamHandler() - - # ".1s" is for the first letter: http://stackoverflow.com/a/27453084/1869. 
- format_string = "%(asctime)s %(levelname).1s %(process)d %(filename)s:%(lineno)d] %(message)s" - handler.setFormatter(_LogFormatter(format_string)) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - Log._initialized = True - -Log._static_init() diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index ea47698b..1bb8a1b6 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -1,7 +1,6 @@ +import logging import os -from log import Log - import sys import importlib.abc import importlib @@ -13,7 +12,6 @@ class GcpModuleFinder(importlib.abc.MetaPathFinder): _MODULES = [ 'google.cloud.bigquery', 'google.cloud.storage', - 'google.cloud.automl_v1beta1', 'google.cloud.translate', 'google.cloud.translate_v2', 'google.cloud.translate_v3', @@ -56,7 +54,6 @@ def create_module(self, spec): _LOADERS = { 'google.cloud.bigquery': kaggle_gcp.init_bigquery, 'google.cloud.storage': kaggle_gcp.init_gcs, - 'google.cloud.automl_v1beta1': kaggle_gcp.init_automl, 'google.cloud.translate': kaggle_gcp.init_translation_v3, 'google.cloud.translate_v2': kaggle_gcp.init_translation_v2, 'google.cloud.translate_v3': kaggle_gcp.init_translation_v3, @@ -117,3 +114,30 @@ def new_configure(*args, **kwargs): module.configure = new_configure module.configure() # generativeai can use GOOGLE_API_KEY env variable, so make sure we have the other configs set + +@wrapt.when_imported('google.genai') +def post_genai_import_logic(module): + if os.getenv('KAGGLE_DISABLE_GOOGLE_GENERATIVE_AI_INTEGRATION'): + return + + if not (os.getenv('KAGGLE_DATA_PROXY_TOKEN') and + os.getenv('KAGGLE_USER_SECRETS_TOKEN') and + os.getenv('KAGGLE_DATA_PROXY_URL')): + return + @wrapt.patch_function_wrapper(module, 'Client.__init__') + def init_wrapper(wrapped, instance, args, kwargs): + # Don't want to forward requests that are to Vertex AI, debug mode, or have their own http_options specified + # Thus, if the client constructor contains any params other than api_key, we don't set up 
forwarding + if any(value is not None for key, value in kwargs.items() if key != 'api_key'): + return wrapped(*args, **kwargs) + + default_metadata = { + "x-kaggle-proxy-data": os.environ['KAGGLE_DATA_PROXY_TOKEN'], + 'x-kaggle-authorization': f"Bearer {os.environ['KAGGLE_USER_SECRETS_TOKEN']}" + } + http_options = { + 'base_url': os.getenv('KAGGLE_DATA_PROXY_URL') + '/palmapi/', + 'headers': default_metadata + } + kwargs['http_options'] = http_options + return wrapped(*args, **kwargs) diff --git a/test b/test index ef1ffe3e..574b49e3 100755 --- a/test +++ b/test @@ -3,7 +3,7 @@ set -e IMAGE_TAG='kaggle/python-build' IMAGE_TAG_OVERRIDE='' -ADDITONAL_OPTS='' +ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default PATTERN='test*.py' usage() { @@ -28,7 +28,7 @@ while :; do ;; -g|--gpu) IMAGE_TAG='kaggle/python-gpu-build' - ADDITONAL_OPTS='-v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' + ADDITONAL_OPTS='--runtime nvidia -v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' ;; -i|--image) if [[ -z $2 ]]; then @@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS readonly PATTERN set -x -docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/* -docker rm jupyter_test || true mkdir -p /tmp/python-build/tmp mkdir -p /tmp/python-build/devshm mkdir -p /tmp/python-build/working @@ -97,6 +95,9 @@ fi # Note about `--hostname localhost` (b/158137436) # hostname defaults to the container name which fails DNS name # resolution with --net=none (required to keep tests hermetic). See details in bug. +# +# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud +# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs. 
docker run --rm -t --read-only --net=none \ -e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \ -e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \ @@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \ -e KAGGLE_DATA_PROXY_PROJECT=test \ -e TF_FORCE_GPU_ALLOW_GROWTH=true \ -e XLA_PYTHON_CLIENT_PREALLOCATE=false \ + -e CLOUDSDK_CONFIG=/tmp/.config/gcloud \ --hostname localhost \ --shm-size=2g \ -v $PWD:/input:ro -v /tmp/python-build/working:/working \ diff --git a/tests/common.py b/tests/common.py index 30a7bb0f..469033dd 100644 --- a/tests/common.py +++ b/tests/common.py @@ -11,7 +11,10 @@ def getAcceleratorName(): except FileNotFoundError: return("nvidia-smi not found.") -gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests') +def isGPU(): + return os.path.isfile('/proc/driver/nvidia/version') + +gpu_test = unittest.skipIf(not isGPU(), 'Not running GPU tests') # b/342143152 P100s are slowly being unsupported in new release of popular ml tools such as RAPIDS. 
p100_exempt = unittest.skipIf(getAcceleratorName() == "Tesla P100-PCIE-16GB", 'Not running p100 exempt tests') tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests') diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/metadata.json b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/metadata.json deleted file mode 100755 index e6beacde..00000000 --- a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/metadata.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "keras_version": "3.0.1", - "keras_nlp_version": "0.7.0", - "parameter_count": 4385920, - "date_saved": "2023-12-27@02:02:24" -} \ No newline at end of file diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/tokenizer.json b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/tokenizer.json deleted file mode 100755 index 48d99632..00000000 --- a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/tokenizer.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "module": "keras_nlp.src.models.bert.bert_tokenizer", - "class_name": "BertTokenizer", - "config": { - "name": "bert_tokenizer", - "trainable": true, - "dtype": "int32", - "vocabulary": null, - "sequence_length": null, - "lowercase": true, - "strip_accents": false, - "split": true, - "suffix_indicator": "##", - "oov_token": "[UNK]" - }, - "registered_name": "keras_nlp>BertTokenizer", - "assets": [ - "assets/tokenizer/vocabulary.txt" - ], - "weights": null -} \ No newline at end of file diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/assets/tokenizer/vocabulary.txt b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/assets/tokenizer/vocabulary.txt old mode 100755 new mode 100644 similarity index 100% rename from tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/assets/tokenizer/vocabulary.txt rename to 
tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/assets/tokenizer/vocabulary.txt diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/config.json b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/config.json similarity index 68% rename from tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/config.json rename to tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/config.json index 3afddd31..94aa0b65 100755 --- a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/2/config.json +++ b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/config.json @@ -1,5 +1,5 @@ { - "module": "keras_nlp.src.models.bert.bert_backbone", + "module": "keras_hub.src.models.bert.bert_backbone", "class_name": "BertBackbone", "config": { "name": "bert_backbone", @@ -13,7 +13,5 @@ "max_sequence_length": 512, "num_segments": 2 }, - "registered_name": "keras_nlp>BertBackbone", - "assets": [], - "weights": "model.weights.h5" + "registered_name": "keras_hub>BertBackbone" } \ No newline at end of file diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/metadata.json b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/metadata.json new file mode 100755 index 00000000..db25ecad --- /dev/null +++ b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/metadata.json @@ -0,0 +1,10 @@ +{ + "keras_version": "3.7.0", + "keras_hub_version": "0.19.0", + "parameter_count": 4385920, + "date_saved": "2024-12-20@19:42:50", + "tasks": [ + "MaskedLM", + "TextClassifier" + ] +} \ No newline at end of file diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/model.weights.h5 b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/model.weights.h5 new file mode 100755 index 00000000..2951f93d Binary files /dev/null and 
b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/model.weights.h5 differ diff --git a/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/tokenizer.json b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/tokenizer.json new file mode 100755 index 00000000..d32697cc --- /dev/null +++ b/tests/data/kagglehub/models/keras/bert/keras/bert_tiny_en_uncased/3/tokenizer.json @@ -0,0 +1,27 @@ +{ + "module": "keras_hub.src.models.bert.bert_tokenizer", + "class_name": "BertTokenizer", + "config": { + "name": "bert_tokenizer", + "trainable": true, + "dtype": { + "module": "keras", + "class_name": "DTypePolicy", + "config": { + "name": "int32" + }, + "registered_name": null + }, + "config_file": "tokenizer.json", + "vocabulary": null, + "sequence_length": null, + "lowercase": true, + "strip_accents": false, + "split": true, + "suffix_indicator": "##", + "oov_token": "[UNK]", + "special_tokens": null, + "special_tokens_in_strings": false + }, + "registered_name": "keras_hub>BertTokenizer" +} \ No newline at end of file diff --git a/tests/test_annoy.py b/tests/test_annoy.py deleted file mode 100644 index 93b7d0c2..00000000 --- a/tests/test_annoy.py +++ /dev/null @@ -1,11 +0,0 @@ -import unittest - -from annoy import AnnoyIndex - - -class TestAnnoy(unittest.TestCase): - def test_tree(self): - t = AnnoyIndex(5, 'angular') - t.add_item(1, [1,2,3,4,5]) - - self.assertTrue(t.build(1)) diff --git a/tests/test_automl.py b/tests/test_automl.py deleted file mode 100644 index 63c34c69..00000000 --- a/tests/test_automl.py +++ /dev/null @@ -1,137 +0,0 @@ -import unittest - -from unittest.mock import Mock, patch - -from kaggle_gcp import KaggleKernelCredentials, init_automl -from test.support.os_helper import EnvironmentVarGuard -from google.cloud import storage, automl_v1beta1, automl - -def _make_credentials(): - import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) - -class TestAutoMl(unittest.TestCase): 
- - class FakeClient: - def __init__(self, credentials=None, client_info=None, **kwargs): - self.credentials = credentials - - class FakeConnection(): - def __init__(self, user_agent): - self.user_agent = user_agent - if (client_info is not None): - self._connection = FakeConnection(client_info.user_agent) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_user_provided_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - client = automl.AutoMlClient(credentials=credentials) - self.assertNotIsInstance(client.credentials, KaggleKernelCredentials) - self.assertIsNotNone(client.credentials) - - def test_tables_gcs_client(self): - # The GcsClient can't currently be monkeypatched for default - # credentials because it requires a project which can't be set. - # Verify that creating an automl_v1beta1.GcsClient given an actual - # storage.Client sets the client properly. 
- gcs_client = storage.Client(project="xyz", credentials=_make_credentials()) - tables_gcs_client = automl_v1beta1.GcsClient(client=gcs_client) - self.assertIs(tables_gcs_client.client, gcs_client) - - @patch("google.cloud.automl_v1beta1.gapic.auto_ml_client.AutoMlClient", new=FakeClient) - def test_tables_client_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient(credentials=credentials) - self.assertEqual(tables_client.auto_ml_client.credentials, credentials) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl_v1beta1.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.TablesClient", new=FakeClient) - def test_default_credentials_tables_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - 
env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient() - self.assertIsNotNone(tables_client.credentials) - self.assertIsInstance(tables_client.credentials, KaggleKernelCredentials) - self.assertTrue(tables_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - def test_monkeypatching_idempotent(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - client1 = automl.AutoMlClient.__init__ - init_automl() - client2 = automl.AutoMlClient.__init__ - self.assertEqual(client1, client2) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_legacy_AUTOML_variable_v1beta1_client(self): - """ - Tests previous 
KAGGLE_KERNEL_INTEGRATIONS="AUTOML" environment setting - """ - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'AUTOML') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) \ No newline at end of file diff --git a/tests/test_datashader.py b/tests/test_datashader.py deleted file mode 100644 index ad3afe15..00000000 --- a/tests/test_datashader.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestDatashader(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. - def test_pipeline(self): - # based on https://github.com/pyviz/datashader/blob/master/datashader/tests/test_pipeline.py - import numpy as np - import pandas as pd - import datashader as ds - import datashader.transfer_functions as tf - - df = pd.DataFrame({ - 'x': np.array(([0.] * 10 + [1] * 10)), - 'y': np.array(([0.] 
* 5 + [1] * 5 + [0] * 5 + [1] * 5)), - 'f64': np.arange(20, dtype='f8') - }) - df.f64.iloc[2] = np.nan - - cvs = ds.Canvas(plot_width=2, plot_height=2, x_range=(0, 1), y_range=(0, 1)) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.count()) - self.assertTrue(img.equals(tf.shade(agg))) - - color_fn = lambda agg: tf.shade(agg, 'pink', 'red') - pipeline.color_fn = color_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(agg))) - - transform_fn = lambda agg: agg + 1 - pipeline.transform_fn = transform_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(transform_fn(agg)))) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y'), ds.sum('f64')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.sum('f64')) - self.assertTrue(img.equals(tf.shade(agg))) diff --git a/tests/test_fastai.py b/tests/test_fastai.py index 0de1f82f..33a436a5 100644 --- a/tests/test_fastai.py +++ b/tests/test_fastai.py @@ -1,34 +1,36 @@ import unittest import fastai - from fastai.tabular.all import * + class TestFastAI(unittest.TestCase): - # Basic import - def test_basic(self): - import fastai - import fastcore - import fastprogress - import fastdownload - - def test_has_version(self): - self.assertGreater(len(fastai.__version__), 2) - - # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 - def test_torch_tensor(self): - a = tensor([1, 2, 3]) - b = torch.tensor([1, 2, 3]) - - self.assertTrue(torch.all(a == b)) - - def test_tabular(self): - dls = TabularDataLoaders.from_csv( - "/input/tests/data/train.csv", - cont_names=["pixel"+str(i) for i in range(784)], - y_names='label', - procs=[FillMissing, Categorify, Normalize]) - learn = tabular_learner(dls, layers=[200, 100]) - learn.fit_one_cycle(n_epoch=1) - - self.assertGreater(learn.smooth_loss, 0) + # Basic import + def test_basic(self): + import fastai + import 
fastcore + import fastprogress + import fastdownload + + def test_has_version(self): + self.assertGreater(len(fastai.__version__), 2) + + # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 + def test_torch_tensor(self): + a = tensor([1, 2, 3]) + b = torch.tensor([1, 2, 3]) + + self.assertTrue(torch.all(a == b)) + + def test_tabular(self): + dls = TabularDataLoaders.from_csv( + "/input/tests/data/train.csv", + cont_names=["pixel" + str(i) for i in range(784)], + y_names="label", + procs=[FillMissing, Categorify, Normalize], + ) + learn = tabular_learner(dls, layers=[200, 100]) + with learn.no_bar(): + learn.fit_one_cycle(n_epoch=1) + + self.assertGreater(learn.smooth_loss, 0) diff --git a/tests/test_gcs.py b/tests/test_gcs.py index eb15ea5f..94da58c9 100644 --- a/tests/test_gcs.py +++ b/tests/test_gcs.py @@ -8,7 +8,9 @@ def _make_credentials(): import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) + credentials = Mock(spec=google.auth.credentials.Credentials) + credentials.universe_domain = 'googleapis.com' + return credentials class TestStorage(unittest.TestCase): diff --git a/tests/test_geopandas.py b/tests/test_geopandas.py deleted file mode 100644 index 4c0106b2..00000000 --- a/tests/test_geopandas.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -import geopandas - -class TestGeopandas(unittest.TestCase): - def test_read(self): - df = geopandas.read_file(geopandas.datasets.get_path('nybb')) - self.assertTrue(df.size > 1) - - def test_spatial_join(self): - cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities')) - world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - countries = world[['geometry', 'name']] - countries = countries.rename(columns={'name':'country'}) - cities_with_country = geopandas.sjoin(cities, countries, how="inner", op='intersects') - self.assertTrue(cities_with_country.size > 1) diff --git 
a/tests/test_google_genai_patch.py b/tests/test_google_genai_patch.py new file mode 100644 index 00000000..9d225763 --- /dev/null +++ b/tests/test_google_genai_patch.py @@ -0,0 +1,55 @@ +import json +import unittest +import threading + +from test.support.os_helper import EnvironmentVarGuard +from urllib.parse import urlparse + +from http.server import BaseHTTPRequestHandler, HTTPServer + +class HTTPHandler(BaseHTTPRequestHandler): + called = False + path = None + headers = {} + + def do_HEAD(self): + self.send_response(200) + + def do_POST(self): + HTTPHandler.path = self.path + HTTPHandler.headers = self.headers + HTTPHandler.called = True + self.send_response(200) + self.send_header("Content-type", "application/json") + self.end_headers() + +class TestGoogleGenAiPatch(unittest.TestCase): + endpoint = "http://127.0.0.1:80" + + def test_proxy_enabled(self): + env = EnvironmentVarGuard() + secrets_token = "secrets_token" + proxy_token = "proxy_token" + env.set("KAGGLE_USER_SECRETS_TOKEN", secrets_token) + env.set("KAGGLE_DATA_PROXY_TOKEN", proxy_token) + env.set("KAGGLE_DATA_PROXY_URL", self.endpoint) + server_address = urlparse(self.endpoint) + with env: + with HTTPServer((server_address.hostname, server_address.port), HTTPHandler) as httpd: + threading.Thread(target=httpd.serve_forever).start() + from google import genai + api_key = "NotARealAPIKey" + client = genai.Client(api_key = api_key) + try: + client.models.generate_content( + model="gemini-2.0-flash-exp", + contents="What's the largest planet in our solar system?" 
+ ) + except: + pass + httpd.shutdown() + self.assertTrue(HTTPHandler.called) + self.assertIn("/palmapi", HTTPHandler.path) + self.assertEqual(proxy_token, HTTPHandler.headers["x-kaggle-proxy-data"]) + self.assertEqual("Bearer {}".format(secrets_token), HTTPHandler.headers["x-kaggle-authorization"]) + self.assertEqual(api_key, HTTPHandler.headers["x-goog-api-key"]) diff --git a/tests/test_google_import_adk.py b/tests/test_google_import_adk.py new file mode 100644 index 00000000..9ae11314 --- /dev/null +++ b/tests/test_google_import_adk.py @@ -0,0 +1,31 @@ +import json +import unittest +import threading +from urllib.parse import urlparse + +class TestGoogleADK(unittest.TestCase): + + def define_agent(self): + from google.adk.agents import Agent + from google.adk.models.google_llm import Gemini + from google.adk.runners import InMemoryRunner + from google.adk.tools import google_search + from google.genai import types + + retry_config = types.HttpRetryOptions( + attempts=5, # Maximum retry attempts + exp_base=7, # Delay multiplier + initial_delay=1, # Initial delay before first retry (in seconds) + http_status_codes=[429, 500, 503, 504] # Retry on these HTTP errors + ) + + root_agent = Agent( + name="helpful_assistant", + model=Gemini( + model="gemini-2.0-flash-lite", + retry_options=retry_config + ), + description="A simple agent that can answer general questions.", + instruction="You are a helpful assistant. Use Google Search for current info or if unsure.", + tools=[google_search], + ) diff --git a/tests/test_imports.py b/tests/test_imports.py index b22ebe7a..6c429516 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -3,6 +3,5 @@ class TestImport(unittest.TestCase): # Basic import tests for packages without any. 
def test_basic(self): - import bq_helper import tensorflow_datasets import segment_anything diff --git a/tests/test_jax.py b/tests/test_jax.py index b5e0898e..f8eca3bb 100644 --- a/tests/test_jax.py +++ b/tests/test_jax.py @@ -6,7 +6,7 @@ import jax import jax.numpy as np -from common import gpu_test +from common import gpu_test, isGPU from jax import grad, jit @@ -21,5 +21,5 @@ def test_grad(self): self.assertEqual(0.4199743, ag) def test_backend(self): - expected_backend = 'cpu' if len(os.environ.get('CUDA_VERSION', '')) == 0 else 'gpu' + expected_backend = 'cpu' if not isGPU() else 'gpu' self.assertEqual(expected_backend, jax.default_backend()) diff --git a/tests/test_jiter.py b/tests/test_jiter.py new file mode 100644 index 00000000..6b31925e --- /dev/null +++ b/tests/test_jiter.py @@ -0,0 +1,9 @@ +import unittest + +from distutils.version import StrictVersion + +import jiter + +class TestJiter(unittest.TestCase): + def test_version(self): + self.assertEqual(StrictVersion(jiter.__version__), StrictVersion("0.10.0")) diff --git a/tests/test_kagglehub.py b/tests/test_kagglehub.py index 37b11248..f2c3e2a6 100644 --- a/tests/test_kagglehub.py +++ b/tests/test_kagglehub.py @@ -8,8 +8,10 @@ class TestKagglehub(unittest.TestCase): def test_login(self): with self.assertLogs('kagglehub', level='INFO') as l: with mock.patch("builtins.input") as mock_input: - mock_input.side_effect = ["lastplacelarry", "some-key"] - # Disabling credentials validation since network access is disabled in unittest. 
- kagglehub.login(validate_credentials=False) + with mock.patch("getpass.getpass") as mock_getpass: + mock_input.side_effect = ["lastplacelarry"] + mock_getpass.return_value = "some-key" - self.assertIn("credentials set", l.output[0]) + kagglehub.login(validate_credentials=False) + + self.assertIn("credentials set", l.output[0]) diff --git a/tests/test_keras.py b/tests/test_keras.py index 22cb6f9f..5dc4610d 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -9,10 +9,11 @@ class TestKeras(unittest.TestCase): def test_train(self): - # Load the data and split it between train and test sets - (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data( - path='/input/tests/data/mnist.npz' - ) + path = '/input/tests/data/mnist.npz' + with np.load(path) as f: + x_train, y_train = f['x_train'], f['y_train'] + x_test, y_test = f['x_test'], f['y_test'] + # Scale images to the [0, 1] range x_train = x_train.astype("float32") / 255 diff --git a/tests/test_matplotlib.py b/tests/test_matplotlib.py index c04f3f23..125ccda4 100644 --- a/tests/test_matplotlib.py +++ b/tests/test_matplotlib.py @@ -8,10 +8,6 @@ import numpy as np class TestMatplotlib(unittest.TestCase): - def test_version(self): - # b/308525631: newer versions of Matplotlib causes learntools to fail - self.assertLess(StrictVersion(matplotlib.__version__), StrictVersion("3.8.0")) - def test_plot(self): plt.plot(np.linspace(0,1,50), np.random.rand(50)) plt.savefig("plot1.png") diff --git a/tests/test_numpy.py b/tests/test_numpy.py index 071c3d30..ab7ec03c 100644 --- a/tests/test_numpy.py +++ b/tests/test_numpy.py @@ -3,20 +3,11 @@ from distutils.version import StrictVersion import numpy as np -from numpy.distutils.system_info import get_info - -class TestNumpy(unittest.TestCase): - def test_version(self): - # b/370860329: newer versions are not capable with current tensorflow - self.assertEqual(StrictVersion(np.__version__), StrictVersion("1.26.4")) +import io +from contextlib import 
redirect_stdout +class TestNumpy(unittest.TestCase): def test_array(self): array = np.array([1, 3]) self.assertEqual((2,), array.shape) - - # Numpy must be linked to the MKL. (Occasionally, a third-party package will muck up the installation - # and numpy will be reinstalled with an OpenBLAS backing.) - def test_mkl(self): - # This will throw an exception if the MKL is not linked correctly or return an empty dict. - self.assertTrue(get_info("blas_mkl")) diff --git a/tests/test_pydegensac.py b/tests/test_pydegensac.py deleted file mode 100644 index be72b53e..00000000 --- a/tests/test_pydegensac.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest - -import pydegensac -import numpy as np - - -class TestPydegensac(unittest.TestCase): - def test_find_homography(self): - src_pts = np.float32([ [0,0],[0,1],[1,1],[1,0] ]).reshape(-1,2) - dst_pts = np.float32([ [0,0],[0,-1],[-1,-1],[-1,0] ]).reshape(-1,2) - - H, mask = pydegensac.findHomography(src_pts, dst_pts, 4, 1) - - self.assertEqual(3, len(H)) - self.assertEqual(4, len(mask)) - - - diff --git a/tests/test_qgrid.py b/tests/test_qgrid.py deleted file mode 100644 index e97ef2a1..00000000 --- a/tests/test_qgrid.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -import numpy as np -import pandas as pd - -from qgrid import QgridWidget - - -class TestQgrid(unittest.TestCase): - def test_nans(self): - df = pd.DataFrame([(pd.Timestamp('2017-02-02'), np.nan), - (4, 2), - ('foo', 'bar')]) - view = QgridWidget(df=df) - - self.assertIsNotNone(view.get_changed_df()) diff --git a/tests/test_tensorflow_cloud.py b/tests/test_tensorflow_cloud.py deleted file mode 100644 index 2875e121..00000000 --- a/tests/test_tensorflow_cloud.py +++ /dev/null @@ -1,8 +0,0 @@ -import unittest - -import tensorflow_cloud as tfc - - -class TestTensorflowCloud(unittest.TestCase): - def test_remote(self): - self.assertFalse(tfc.remote()) diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py new file mode 100644 index 00000000..c4a702fd --- 
/dev/null +++ b/tests/test_torchtune.py @@ -0,0 +1,16 @@ +import unittest +import subprocess + +class TestTorchtune(unittest.TestCase): + def test_help(self): + result = subprocess.run( + ["tune", "--help"], + capture_output=True, + text=True + ) + + self.assertEqual(0, result.returncode) + self.assertIn( + "Download a model from the Hugging Face Hub or Kaggle", + result.stdout + ) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index a81714cc..910eab30 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -1,7 +1,7 @@ import unittest import torch -from transformers import AdamW +import torch.optim as optim import transformers.pipelines # verify this import works @@ -10,13 +10,12 @@ def assertListAlmostEqual(self, list1, list2, tol): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): self.assertAlmostEqual(a, b, delta=tol) - def test_adam_w(self): w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) target = torch.tensor([0.4, 0.2, -0.5]) criterion = torch.nn.MSELoss() # No warmup, constant schedule, no gradient clipping - optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) + optimizer = optim.AdamW(params=[w], lr=2e-1, weight_decay=0.0) for _ in range(100): loss = criterion(w, target) loss.backward() diff --git a/tests/test_translation.py b/tests/test_translation.py index 5bb41b62..52de2a08 100644 --- a/tests/test_translation.py +++ b/tests/test_translation.py @@ -6,7 +6,7 @@ from kaggle_gcp import KaggleKernelCredentials, KaggleKernelWithProjetCredentials, init_translation_v2, init_translation_v3 from test.support.os_helper import EnvironmentVarGuard from google.api_core import client_options -from google.cloud import translate, translate_v2 +from google.cloud import translate_v3 as translate, translate_v2 def _make_credentials(): import google.auth.credentials @@ -48,7 +48,7 @@ def test_user_provided_credentials_v2(self): self.assertIsNotNone(client.credentials) 
self.assertNotIsInstance(client.credentials, KaggleKernelCredentials) - @patch("google.cloud.translate.TranslationServiceClient", new=FakeClient) + @patch("google.cloud.translate_v3.TranslationServiceClient", new=FakeClient) def test_default_credentials_v3(self): env = EnvironmentVarGuard() env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') @@ -60,7 +60,7 @@ def test_default_credentials_v3(self): self.assertIsInstance(client.credentials, KaggleKernelCredentials) - @patch("google.cloud.translate.TranslationServiceClient", new=FakeClient) + @patch("google.cloud.translate_v3.TranslationServiceClient", new=FakeClient) def test_user_provided_credentials_v3(self): credentials = _make_credentials() env = EnvironmentVarGuard() @@ -107,13 +107,12 @@ def test_monkeypatching_idempotent(self): self.assertEqual(client2_1, client2_2) self.assertEqual(client3_1, client3_2) - @patch("google.cloud.translate.TranslationServiceClient", new=FakeClient) + @patch("google.cloud.translate_v3.TranslationServiceClient", new=FakeClient) def test_client_credential_uniqueness_v3(self): """ Client instance must use unique KaggleKernelWithProjetCredentials with quota_project_id when client_options.quota_project_id provided. 
(even if quota_project_id is same) """ - credentials = _make_credentials() env = EnvironmentVarGuard() env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') diff --git a/tests/test_user_secrets.py b/tests/test_user_secrets.py index 67c628f7..c11432fe 100644 --- a/tests/test_user_secrets.py +++ b/tests/test_user_secrets.py @@ -200,22 +200,6 @@ def call_get_cloudai_access_token(): client = UserSecretsClient() secret_response = client._get_cloudai_access_token() self.assertEqual(secret_response, (secret, now + timedelta(seconds=3600))) - def call_get_translation_access_token(): - client = UserSecretsClient() - secret_response = client._get_translation_access_token() - self.assertEqual(secret_response, (secret, now + timedelta(seconds=3600))) - def call_get_natural_lang_access_token(): - client = UserSecretsClient() - secret_response = client._get_natural_language_access_token() - self.assertEqual(secret_response, (secret, now + timedelta(seconds=3600))) - def call_get_video_intell_access_token(): - client = UserSecretsClient() - secret_response = client._get_video_intelligence_access_token() - self.assertEqual(secret_response, (secret, now + timedelta(seconds=3600))) - def call_get_vision_access_token(): - client = UserSecretsClient() - secret_response = client._get_vision_access_token() - self.assertEqual(secret_response, (secret, now + timedelta(seconds=3600))) self._test_client(call_get_bigquery_access_token, '/requests/GetUserSecretRequest', {'Target': GcpTarget.BIGQUERY.target}, diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py index 618a63cc..68166813 100644 --- a/tests/test_xgboost.py +++ b/tests/test_xgboost.py @@ -17,10 +17,9 @@ def test_classifier(self): X_test = np.random.random((100, 28)) y_test = np.random.randint(10, size=(100, 1)) - xgb1 = XGBClassifier(n_estimators=3, use_label_encoder=False) + xgb1 = XGBClassifier(n_estimators=3, use_label_encoder=False, eval_metric='mlogloss') xgb1.fit( X_train, 
y_train, eval_set=[(X_train, y_train), (X_test, y_test)], - eval_metric='mlogloss' ) self.assertIn("validation_0", xgb1.evals_result()) diff --git a/tests/utils/kagglehub.py b/tests/utils/kagglehub.py index d7819dde..7a2a8995 100644 --- a/tests/utils/kagglehub.py +++ b/tests/utils/kagglehub.py @@ -1,3 +1,4 @@ +import json import os import threading import re @@ -7,6 +8,8 @@ from test.support.os_helper import EnvironmentVarGuard from http.server import BaseHTTPRequestHandler, HTTPServer +from kagglesdk.kaggle_env import get_endpoint, get_env + class KaggleAPIHandler(BaseHTTPRequestHandler): """ Fake Kaggle API server supporting the download endpoint. @@ -15,15 +18,18 @@ class KaggleAPIHandler(BaseHTTPRequestHandler): def do_HEAD(self): self.send_response(200) - def do_GET(self): - m = re.match("^/api/v1/models/(.+)/download/(.+)$", self.path) - if not m: + def do_POST(self): + content_length = int(self.headers.get('Content-Length', 0)) + body_bytes = self.rfile.read(content_length) + request_body = json.loads(body_bytes.decode('utf-8')) + + if self.path != "/api/v1/models.ModelApiService/DownloadModelInstanceVersion": self.send_response(404) self.wfile.write(bytes(f"Unhandled path: {self.path}", "utf-8")) return - model_handle = m.group(1) - path = m.group(2) + model_handle = f"{request_body["ownerSlug"]}/{request_body["modelSlug"]}/keras/{request_body["instanceSlug"]}/{request_body["versionNumber"]}" + path = request_body["path"] filepath = f"/input/tests/data/kagglehub/models/{model_handle}/{path}" if not os.path.isfile(filepath): self.send_error(404, "Internet is disabled in our tests " @@ -41,14 +47,12 @@ def do_GET(self): @contextmanager def create_test_kagglehub_server(): - endpoint = 'http://localhost:7777' env = EnvironmentVarGuard() - env.set('KAGGLE_API_ENDPOINT', endpoint) - test_server_address = urlparse(endpoint) + env.set('KAGGLE_API_ENVIRONMENT', 'TEST') with env: - if not test_server_address.hostname or not test_server_address.port: - msg = f"Invalid 
test server address: {endpoint}. You must specify a hostname & port" - raise ValueError(msg) + endpoint = get_endpoint(get_env()) + test_server_address = urlparse(endpoint) + with HTTPServer((test_server_address.hostname, test_server_address.port), KaggleAPIHandler) as httpd: threading.Thread(target=httpd.serve_forever).start() diff --git a/tpu/Dockerfile b/tpu/Dockerfile index b94619da..343443ae 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -6,12 +6,9 @@ FROM $BASE_IMAGE # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact ARG PYTHON_WHEEL_VERSION ARG PYTHON_VERSION_PATH -ARG TF_LINUX_WHEEL_VERSION +ARG TENSORFLOW_VERSION ARG TORCH_LINUX_WHEEL_VERSION ARG TORCH_VERSION -ARG TENSORFLOW_VERSION -ARG TF_LIBTPU_VERSION -ARG JAX_VERSION ARG TORCHVISION_VERSION ARG TORCHAUDIO_VERSION @@ -28,59 +25,55 @@ ADD patches/kaggle_session.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packa ADD patches/kaggle_web_client.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_web_client.py ADD patches/kaggle_datasets.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_datasets.py -# Disable GCP integrations for now -# ADD patches/kaggle_gcp.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_gcp.py - -# Disable logging to file (why do we need this?) -# ADD patches/log.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/log.py - -# sitecustomize adds significant latency to ipython kernel startup and should only be added if needed -# ADD patches/sitecustomize.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/sitecustomize.py - # Prereqs # This is needed for cv2 (opencv-python): # https://stackoverflow.com/questions/55313610/importerror-libgl-so-1-cannot-open-shared-object-file-no-such-file-or-directo RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Install all the packages together for maximum compatibility. - -# Install Tensorflow. 
- -# Install Pytorch & related packages -# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version -# The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0 -# We need to keep the numpy version the same as the installed tf one but compatible with other installs. - -# Install JAX & related packages -# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm - -# Packages needed by the Notebook editor - -# Additional useful packages should be added here - -RUN pip install tensorflow_hub https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl tensorflow-probability tensorflow-io \ - torch~=${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ - jax[tpu]==${JAX_VERSION} -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax git+https://github.com/deepmind/dm-haiku jraph distrax \ - papermill jupyterlab python-lsp-server[all] "jupyter-lsp==1.5.1" \ - pandas matplotlib opencv-python-headless librosa accelerate diffusers scikit-learn transformers \ - seaborn timm albumentations einops pyarrow fastparquet opencv-python \ - "keras>3" keras-cv keras-nlp \ - kagglehub && \ +# Additional useful packages should be added in the requirements.txt +# Bring in the requirements.txt and replace variables in it: +RUN apt-get install -y gettext +ADD tpu/requirements.in /kaggle_requirements.in +RUN envsubst < /kaggle_requirements.in > /requirements.in + +# Install uv and then install the requirements: +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip compile 
--system --prerelease=allow \ + --verbose \ + --upgrade \ + --find-links=https://storage.googleapis.com/jax-releases/libtpu_releases.html \ + --find-links=https://storage.googleapis.com/libtpu-releases/index.html \ + --find-links=https://storage.googleapis.com/libtpu-wheels/index.html \ + --find-links=https://download.pytorch.org/whl/torch_stable.html \ + --emit-find-links \ + --no-emit-package pip \ + --no-emit-package setuptools \ + --output-file /requirements.txt \ + /requirements.in && \ + uv pip install --system --prerelease=allow --force-reinstall \ + -r /requirements.txt && \ + uv cache clean && \ + /tmp/clean-layer.sh +ENV PATH="/root/.local/bin:${PATH}" + +# We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. +# Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23. However, that +# libtpu causes pjrt api errors for torch 2.8.0. screenshot/5heUtdyaJ4MmR3D +# https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 +# https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip install --system --force-reinstall libtpu==0.0.17 && \ + uv cache clean && \ + /tmp/clean-layer.sh - -# Tensorflow libtpu: -RUN curl --output /usr/local/lib/python3.10/site-packages/libtpu/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/${TF_LIBTPU_VERSION}/libtpu.so # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py RUN sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' 
/usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py -# Monkey-patch the default TPU to the local (TPU VM). -RUN sed -i 's/tpu=None,/tpu="local",/' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py - # Set these env vars so that they don't produce errs calling the metadata server to load them: -ENV TPU_ACCELERATOR_TYPE=v3-8 ENV TPU_PROCESS_ADDRESSES=local # Metadata @@ -92,7 +85,6 @@ LABEL build-date=$BUILD_DATE ENV GIT_COMMIT=${GIT_COMMIT} ENV BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION LABEL kaggle-lang=python # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. diff --git a/tpu/config.txt b/tpu/config.txt index 4ce1c196..ab933ba7 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -1,17 +1,12 @@ -BASE_IMAGE=python:3.10 -PYTHON_WHEEL_VERSION=cp310 -PYTHON_VERSION_PATH=python3.10 -# gsutil ls gs://cloud-tpu-tpuvm-artifacts/tensorflow -# https://cloud.google.com/tpu/docs/supported-tpu-configurations#libtpu_versions -TENSORFLOW_VERSION=2.16.1 -TF_LIBTPU_VERSION=1.10.1 -TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64 -JAX_VERSION=0.4.23 -# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep -v -E ".*rc[0-9].*" +BASE_IMAGE=python:3.12 +PYTHON_WHEEL_VERSION=cp312 +PYTHON_VERSION_PATH=python3.12 +TENSORFLOW_VERSION=2.20.0 +# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly -TORCH_VERSION=2.4.0 +TORCH_VERSION=2.8.0 # https://github.com/pytorch/audio supports nightly -TORCHAUDIO_VERSION=2.4.0 +TORCHAUDIO_VERSION=2.8.0 # https://github.com/pytorch/vision supports nightly -TORCHVISION_VERSION=0.19.0 +TORCHVISION_VERSION=0.23.0 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64 diff --git a/tpu/requirements.in b/tpu/requirements.in 
new file mode 100644 index 00000000..1fceeebb --- /dev/null +++ b/tpu/requirements.in @@ -0,0 +1,54 @@ +# TPU Utils +tpu-info +# Tensorflow packages +# TODO: b/447621961 - re-enable tensorflow-tpu when a compatible libtpu can be found. +tensorflow-cpu==${TENSORFLOW_VERSION} +tensorflow_hub +tensorflow-io +tensorflow-probability +tensorflow_datasets +# Torch packages +https://download.pytorch.org/whl/cpu/torch-${TORCH_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchaudio-${TORCHAUDIO_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchvision-${TORCHVISION_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +# Jax packages +jax[tpu] +distrax +flax +git+https://github.com/deepmind/dm-haiku +jraph +optax +trax +# Tunix GRPO +git+https://github.com/google/tunix +git+https://github.com/google/qwix +grain +# Jupyter packages +jupyter-lsp==1.5.1 +jupyterlab +notebook +papermill +python-lsp-server[all] +# Keras Packages +keras>3 +keras-cv +keras-nlp +# Kaggle Packages +kagglehub +# Other useful packages, add more here +accelerate +albumentations +diffusers +einops +fastparquet +ipywidgets +matplotlib +opencv-python +opencv-python-headless +pandas +pyarrow +scikit-learn +seaborn +timm +transformers