diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..e6990cd3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,23 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug, help wanted +assignees: '' +--- + +## 🐛 Bug + + + +### To Reproduce + + + +### Expected behavior + + + +### Additional context + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..d999a7b8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,19 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' +--- + +## 🚀 Feature + + + +### Motivation + + + +### Additional context + + diff --git a/.gitignore b/.gitignore index 0d038d25..ef82380f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc .idea/ .vscode -.mypy_cache \ No newline at end of file +.mypy_cache +.generated \ No newline at end of file diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 066e6494..00000000 --- a/Dockerfile +++ /dev/null @@ -1,526 +0,0 @@ -ARG BASE_TAG=m66 -ARG TENSORFLOW_VERSION=2.4.1 - -FROM gcr.io/kaggle-images/python-tensorflow-whl:${TENSORFLOW_VERSION}-py37-2 as tensorflow_whl -FROM gcr.io/deeplearning-platform-release/base-cpu:${BASE_TAG} - -ADD clean-layer.sh /tmp/clean-layer.sh -ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl -ADD patches/template_conf.json /opt/kaggle/conf.json - -# This is necessary for apt to access HTTPS sources -RUN apt-get update && \ - apt-get install apt-transport-https && \ - /tmp/clean-layer.sh - - # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, - # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 -RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \ - apt-get update && \ - # Needed by vowpalwabbit & lightGBM (GPU build). - # https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Python#installing - # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm - apt-get install -y build-essential unzip cmake && \ - apt-get install -y libboost-dev libboost-program-options-dev libboost-system-dev libboost-thread-dev libboost-math-dev libboost-test-dev libboost-python-dev libboost-filesystem-dev zlib1g-dev && \ - # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines. - apt-get install -y openssh-client && \ - /tmp/clean-layer.sh - -# Make sure the dynamic linker finds the right libstdc++ -ENV LD_LIBRARY_PATH=/opt/conda/lib -# b/128333086: Set PROJ_LIB to points to the proj4 cartographic library. -ENV PROJ_LIB=/opt/conda/share/proj - -# Install conda packages not available on pip. -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -# Using the same global consistent ordered list of channels -RUN conda config --add channels conda-forge && \ - conda config --add channels nvidia && \ - conda config --add channels pytorch && \ - conda config --add channels rapidsai && \ - # ^ rapidsai is the highest priority channel, default lowest, conda-forge 2nd lowest. 
- # b/182405233 pyproj 3.x is not compatible with basemap 1.2.1 - # b/161473620#comment7 pin required to prevent resolver from picking pysal 1.x., pysal 2.2.x is also downloading data on import. - conda install matplotlib basemap cartopy python-igraph imagemagick "pyproj=2.6" "pysal==2.1.0" && \ - conda install "pytorch=1.7" "torchvision=0.8" "torchaudio=0.7" "torchtext=0.8" cpuonly && \ - /tmp/clean-layer.sh - -# The anaconda base image includes outdated versions of these packages. Update them to include the latest version. -RUN pip install seaborn python-dateutil dask && \ - pip install pyyaml joblib husl geopy ml_metrics mne pyshp && \ - pip install pandas && \ - # Install h2o from source. - # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. - apt-get install -y default-jre-headless && \ - pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ - /tmp/clean-layer.sh - -# Install tensorflow from a pre-built wheel -COPY --from=tensorflow_whl /tmp/tensorflow_cpu/*.whl /tmp/tensorflow_cpu/ -RUN pip install /tmp/tensorflow_cpu/tensorflow*.whl && \ - rm -rf /tmp/tensorflow_cpu && \ - /tmp/clean-layer.sh - -# Install tensorflow-gcs-config from a pre-built wheel -COPY --from=tensorflow_whl /tmp/tensorflow_gcs_config/*.whl /tmp/tensorflow_gcs_config/ -RUN pip install /tmp/tensorflow_gcs_config/tensorflow*.whl && \ - rm -rf /tmp/tensorflow_gcs_config && \ - /tmp/clean-layer.sh - -# Install TensorFlow addons (TFA). -COPY --from=tensorflow_whl /tmp/tfa_cpu/*.whl /tmp/tfa_cpu/ -RUN pip install /tmp/tfa_cpu/tensorflow*.whl && \ - rm -rf /tmp/tfa_cpu/ && \ - /tmp/clean-layer.sh - -RUN apt-get install -y libfreetype6-dev && \ - apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ - pip install gensim && \ - pip install textblob && \ - pip install wordcloud && \ - pip install xgboost && \ - # Pinned to match GPU version. Update version together. - pip install lightgbm==3.2.0 && \ - pip install pydot && \ - pip install keras && \ - pip install keras-tuner && \ - pip install flake8 && \ - # Pinned because it breaks theano test with the latest version (b/178107003). - pip install theano-pymc==1.0.11 && \ - pip install python-Levenshtein && \ - pip install hep_ml && \ - # NLTK Project datasets - mkdir -p /usr/share/nltk_data && \ - # NLTK Downloader no longer continues smoothly after an error, so we explicitly list - # the corpuses that work - # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095. 
- yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ - basque_grammars biocreative_ppi bllip_wsj_no_aux \ - book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ - comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ - europarl_raw floresta gazetteers genesis gutenberg \ - ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \ - masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ - mte_teip5 names nps_chat omw opinion_lexicon paradigms \ - pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ - pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ - sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ - state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ - twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ - vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - # Stop-words - pip install stop-words && \ - pip install scikit-image && \ - /tmp/clean-layer.sh - -RUN pip install ibis-framework && \ - pip install mxnet && \ - pip install gluonnlp && \ - pip install gluoncv && \ - /tmp/clean-layer.sh - -RUN pip install scipy && \ - # b/176817038 avoid upgrade to 0.24 which is causing issues with hep-ml package. - pip install scikit-learn==0.23.2 && \ - # HDF5 support - pip install h5py && \ - pip install biopython && \ - # PUDB, for local debugging convenience - pip install pudb && \ - pip install imbalanced-learn && \ - # Profiling and other utilities - pip install line_profiler && \ - pip install orderedmultidict && \ - pip install smhasher && \ - pip install bokeh && \ - pip install numba && \ - pip install datashader && \ - # Boruta (python implementation) - pip install Boruta && \ - apt-get install -y graphviz && pip install graphviz && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - pip install git+git://github.com/scikit-learn-contrib/py-earth.git@issue191 && \ - pip install essentia && \ - /tmp/clean-layer.sh - -# vtk with dependencies -RUN apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies - apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN pip install mpld3 && \ - pip install gpxpy && \ - pip install arrow && \ - pip install nilearn && \ - pip install nibabel && \ - pip install pronouncing && \ - pip install markovify && \ - pip install imgaug && \ - pip install preprocessing && \ - pip install path.py && \ - pip install Geohash && \ - # https://github.com/vinsci/geohash/issues/4 - sed -i -- 's/geohash/.geohash/g' /opt/conda/lib/python3.7/site-packages/Geohash/__init__.py && \ - pip install deap && \ - pip install tpot && \ - pip install scikit-optimize && \ - pip install haversine && \ - pip install toolz cytoolz && \ - pip install plotly && \ - pip install hyperopt && \ - pip install fitter && \ - pip install langid && \ - # Delorean. 
Useful for dealing with datetime - pip install delorean && \ - pip install trueskill && \ - # Useful data exploration libraries (for missing data and generating reports) - pip install missingno && \ - pip install pandas-profiling && \ - pip install s2sphere && \ - pip install bayesian-optimization && \ - pip install matplotlib-venn && \ - pip install pyldavis && \ - pip install mlxtend && \ - pip install altair && \ - # b/183944405 pystan 3.x is not compatible with fbprophet. - pip install pystan==2.19.1.1 && \ - pip install ImageHash && \ - pip install ecos && \ - pip install CVXcanon && \ - # b/179264579 cvxpy 1.1.8 requires numpy >= 1.20 - pip install cvxpy==1.1.7 && \ - pip install fancyimpute && \ - pip install pymc3 && \ - pip install imagecodecs && \ - pip install tifffile && \ - pip install spectral && \ - pip install descartes && \ - pip install geojson && \ - pip install pydicom && \ - pip install wavio && \ - pip install SimpleITK && \ - pip install hmmlearn && \ - pip install bayespy && \ - pip install gplearn && \ - pip install PyAstronomy && \ - pip install squarify && \ - pip install fuzzywuzzy && \ - pip install python-louvain && \ - pip install pyexcel-ods && \ - pip install sklearn-pandas && \ - pip install stemming && \ - pip install fbprophet && \ - pip install holoviews && \ - pip install geoviews && \ - pip install hypertools && \ - pip install py_stringsimjoin && \ - pip install mlens && \ - pip install scikit-multilearn && \ - pip install cleverhans && \ - pip install leven && \ - pip install catboost && \ - pip install lightfm && \ - pip install folium && \ - pip install scikit-plot && \ - # dipy requires the optional fury dependency for visualizations. - pip install fury dipy && \ - pip install plotnine && \ - pip install scikit-surprise && \ - pip install pymongo && \ - pip install geoplot && \ - pip install eli5 && \ - pip install implicit && \ - pip install kaggle && \ - /tmp/clean-layer.sh - -RUN pip install tensorpack && \ - # Add google PAIR-code Facets - cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install pycountry && \ - pip install iso3166 && \ - pip install pydash && \ - pip install kmodes --no-dependencies && \ - pip install librosa && \ - pip install polyglot && \ - pip install mmh3 && \ - pip install fbpca && \ - pip install sentencepiece && \ - pip install cufflinks && \ - pip install lime && \ - pip install memory_profiler && \ - /tmp/clean-layer.sh - -# install cython & cysignals before pyfasttext -RUN pip install --upgrade cython && \ - pip install --upgrade cysignals && \ - pip install pyfasttext && \ - # ktext has an explicit dependency on Keras 2.2.4 which is not - # compatible with TensorFlow 2.0 (support was added in Keras 2.3.0). - # Add the package back once it is fixed upstream. 
- # pip install ktext && \ - pip install fasttext && \ - apt-get install -y libhunspell-dev && pip install hunspell && \ - pip install annoy && \ - pip install category_encoders && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - pip install google-cloud-automl==1.0.1 && \ - pip install google-cloud-bigquery==2.2.0 && \ - pip install google-cloud-storage && \ - pip install google-cloud-translate==3.* && \ - pip install google-cloud-language==2.* && \ - pip install google-cloud-videointelligence==2.* && \ - pip install google-cloud-vision==2.* && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. - pip uninstall -y google-cloud-bigquery-storage && \ - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release && \ - pip install ortools && \ - pip install scattertext && \ - # Pandas data reader - pip install pandas-datareader && \ - pip install wordsegment && \ - pip install wordbatch && \ - pip install emoji && \ - # Add Japanese morphological analysis engine - pip install janome && \ - pip install wfdb && \ - pip install vecstack && \ - # yellowbrick machine learning visualization library - pip install yellowbrick && \ - pip install mlcrate && \ - /tmp/clean-layer.sh - -RUN pip install bleach && \ - pip install certifi && \ - pip install cycler && \ - pip install decorator && \ - pip install entrypoints && \ - pip install html5lib && \ - pip install ipykernel && \ - pip install ipython && \ - pip install ipython-genutils && \ - pip install ipywidgets && \ - pip install isoweek && \ - pip install jedi && \ - pip install Jinja2 && \ - pip install jsonschema && \ - pip install jupyter-client && \ - pip install jupyter-console && \ - pip install jupyter-core && \ - pip install MarkupSafe && \ - pip install mistune && \ - pip install nbconvert && \ - pip install nbformat && \ - pip install notebook && \ - pip install papermill && \ - pip install olefile && \ - pip install kornia && \ - pip install pandas_summary && \ - pip install pandocfilters && \ - pip install pexpect && \ - pip install pickleshare && \ - pip install Pillow && \ - # Install openslide and its python binding - apt-get install -y openslide-tools && \ - pip install openslide-python && \ - pip install ptyprocess && \ - pip install Pygments && \ - pip install pyparsing && \ - pip install pytz && \ - pip install PyYAML && \ - pip install pyzmq && \ - pip install qtconsole && \ - pip install six && \ - pip install terminado && \ - pip install tornado && \ - pip install tqdm && \ - pip install traitlets && \ - pip install wcwidth && \ - pip install webencodings && \ - pip install widgetsnbextension && \ - pip install pyarrow && \ - pip install feather-format && \ - pip install fastai && \ - pip install allennlp && \ - # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5 - pip install importlib-metadata==3.4.0 && \ - python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ - apt-get install -y ffmpeg && \ - /tmp/clean-layer.sh - - ########### - # - # NEW CONTRIBUTORS: - # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end - # of all non-final lines. Thanks! 
- # - ########### - -RUN pip install flashtext && \ - pip install wandb && \ - pip install marisa-trie && \ - pip install pyemd && \ - pip install pyupset && \ - pip install pympler && \ - pip install s3fs && \ - pip install featuretools && \ - pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper && \ - pip install hpsklearn && \ - pip install git+https://github.com/Kaggle/learntools && \ - pip install kmapper && \ - pip install shap && \ - pip install ray && \ - pip install gym && \ - pip install pyarabic && \ - pip install pandasql && \ - pip install tensorflow_hub && \ - pip install jieba && \ - pip install git+https://github.com/SauceCat/PDPbox && \ - # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 - pip install https://github.com/hbasria/ggpy/archive/0.11.5.zip && \ - pip install cesium && \ - pip install rgf_python && \ - # b/185992410: onnx is a dependency of pytext, but the version 1.9.0 breaks pytext test. - # Remove this installation when pytext fixes the problem. - pip install onnx==1.8.1 && \ - # b/145404107: latest version force specific version of numpy and torch. - pip install pytext-nlp==0.1.2 && \ - pip install tsfresh && \ - pip install pykalman && \ - pip install optuna && \ - pip install plotly_express && \ - pip install albumentations && \ - pip install catalyst && \ - pip install osmnx && \ - apt-get -y install libspatialindex-dev && \ - pip install pytorch-ignite && \ - pip install qgrid && \ - pip install bqplot && \ - pip install earthengine-api && \ - pip install transformers && \ - pip install dlib && \ - pip install kaggle-environments && \ - pip install geopandas && \ - pip install nnabla && \ - pip install vowpalwabbit && \ - # papermill can replace nbconvert for executing notebooks - pip install cloud-tpu-client && \ - pip install tensorflow-cloud && \ - pip install tensorflow-datasets && \ - pip install pydub && \ - pip install pydegensac && \ - pip install pytorch-lightning && \ - pip install datatable && \ - pip install sympy && \ - # flask is used by agents in the simulation competitions. - pip install flask && \ - # pycrypto is used by competitions team. - pip install pycrypto && \ - pip install easyocr && \ - # Keep JAX version in sync with GPU image. - pip install jax==0.2.12 jaxlib==0.1.64 && \ - # ipympl adds interactive widget support for matplotlib - pip install ipympl==0.7.0 && \ - pip install pandarallel && \ - /tmp/clean-layer.sh - -# Download base easyocr models. 
-# https://github.com/JaidedAI/EasyOCR#usage -RUN mkdir -p /root/.EasyOCR/model && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip && \ - unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/latin.zip && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip && \ - unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/english.zip && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip && \ - unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/craft_mlt_25k.zip && \ - /tmp/clean-layer.sh - -# Tesseract and some associated utility packages -RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract && \ - pip install wand && \ - pip install pdf2image && \ - pip install PyPDF && \ - pip install pyocr && \ - /tmp/clean-layer.sh -ENV TESSERACT_PATH=/usr/bin/tesseract - -# For Facets -ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ -# For Theano with MKL -ENV MKL_THREADING_LAYER=GNU - -# Temporary fixes and patches - # Temporary patch for Dask getting downgraded, which breaks Keras -RUN pip install --upgrade dask && \ - # Stop jupyter nbconvert trying to rewrite its folder hierarchy - mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ - mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ - # Stop Matplotlib printing junk to the console on first load - sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.7/site-packages/matplotlib/font_manager.py && \ - # Make matplotlib output in Jupyter notebooks display correctly - mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ - # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. 
- ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - /tmp/clean-layer.sh - -# gcloud SDK https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu -RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ - | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ - apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ - apt-get update -y && apt-get install google-cloud-sdk -y && \ - /tmp/clean-layer.sh - -# Add BigQuery client proxy settings -ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py /root/.local/lib/python3.7/site-packages/kaggle_gcp.py -ADD patches/kaggle_secrets.py /root/.local/lib/python3.7/site-packages/kaggle_secrets.py -ADD patches/kaggle_session.py /root/.local/lib/python3.7/site-packages/kaggle_session.py -ADD patches/kaggle_web_client.py /root/.local/lib/python3.7/site-packages/kaggle_web_client.py -ADD patches/kaggle_datasets.py /root/.local/lib/python3.7/site-packages/kaggle_datasets.py -ADD patches/log.py /root/.local/lib/python3.7/site-packages/log.py -ADD patches/sitecustomize.py /root/.local/lib/python3.7/site-packages/sitecustomize.py -# Override default imagemagick policies -ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml - -# TensorBoard Jupyter extension. Should be replaced with TensorBoard's provided magic once we have -# worker tunneling support in place. -# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" -# RUN pip install jupyter_tensorboard && \ -# jupyter serverextension enable jupyter_tensorboard && \ -# jupyter tensorboard enable -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py - -# Disable unnecessary jupyter extensions -RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ - jupyter-serverextension disable nb_conda --py --sys-prefix && \ - python -m nb_conda_kernels.install --disable - -# Set backend for matplotlib -ENV MPLBACKEND "agg" - -# We need to redefine TENSORFLOW_VERSION here to get the default ARG value defined above the FROM instruction. -# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG TENSORFLOW_VERSION -ARG GIT_COMMIT=unknown -ARG BUILD_DATE=unknown - -LABEL git-commit=$GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL tensorflow-version=$TENSORFLOW_VERSION -# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. -LABEL kaggle-lang=python - -# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. 
-RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl new file mode 100644 index 00000000..9d661201 --- /dev/null +++ b/Dockerfile.tmpl @@ -0,0 +1,164 @@ +{{ if eq .Accelerator "gpu" }} +FROM us-docker.pkg.dev/colab-images/public/runtime:release-colab-external_20260126-060048_RC00 +{{ else }} +FROM us-docker.pkg.dev/colab-images/public/cpu-runtime:release-colab-external_20260123-060023_RC00 +{{ end}} + +ADD kaggle_requirements.txt /kaggle_requirements.txt + +# Freeze existing requirements from base image for critical packages: +RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt + +# Merge requirements files: +RUN cat /colab_requirements.txt >> /requirements.txt +RUN cat /kaggle_requirements.txt >> /requirements.txt + +# Install Kaggle packages +RUN uv pip install --system --no-cache -r /requirements.txt + +# Install manual packages: +# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. +RUN uv pip uninstall --system --no-cache google-cloud-bigquery-storage + +# uv cannot install this in requirements.txt without --no-build-isolation +# to avoid affecting the larger build, we'll post-install it. +RUN uv pip install --no-build-isolation --no-cache --system "git+https://github.com/Kaggle/learntools" + +# b/404590350: Ray and torchtune have conflicting cli named `tune`. `ray` is not part of Colab's base image. Re-install `tune` to ensure the torchtune CLI is available by default. +# b/468367647: Unpin protobuf, version greater than v5.29.5 causes issues with numerous packages +RUN uv pip install --system --force-reinstall --no-cache --no-deps torchtune +RUN uv pip install --system --force-reinstall --no-cache "protobuf==5.29.5" + +# Adding non-package dependencies: +ADD clean-layer.sh /tmp/clean-layer.sh +ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl +ADD patches/template_conf.json /opt/kaggle/conf.json + +ARG PACKAGE_PATH=/usr/local/lib/python3.12/dist-packages + +# Install GPU-specific non-pip packages. +{{ if eq .Accelerator "gpu" }} +RUN uv pip install --system --no-cache "pycuda" +{{ end }} + + +# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, +# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 +RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \ + apt-get update --allow-releaseinfo-change && \ + # Needed by lightGBM (GPU build) + # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm + apt-get install -y build-essential unzip cmake libboost-dev libboost-system-dev libboost-filesystem-dev p7zip-full && \ + # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines. 
+ apt-get install -y openssh-client && \ + apt-get install -y graphviz && pip install graphviz && \ + /tmp/clean-layer.sh + +ADD patches/keras_internal.py \ + patches/keras_internal_test.py \ + $PACKAGE_PATH/tensorflow_decision_forests/keras/ + +RUN apt-get install -y libfreetype6-dev && \ + apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ + /tmp/clean-layer.sh + +RUN mkdir -p /usr/share/nltk_data && \ + # NLTK Downloader no longer continues smoothly after an error, so we explicitly list + # the corpuses that work + python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ + basque_grammars biocreative_ppi bllip_wsj_no_aux \ + book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ + comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ + europarl_raw floresta gazetteers genesis gutenberg \ + ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \ + masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ + mte_teip5 names nps_chat omw opinion_lexicon paradigms \ + pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ + pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ + sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ + state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ + twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe + +RUN apt-get install -y git-lfs && \ + # vtk dependencies + apt-get install -y libgl1-mesa-glx && \ + # xvfbwrapper dependencies + apt-get install -y xvfb && \ + /tmp/clean-layer.sh + +# Download base easyocr models. +# https://github.com/JaidedAI/EasyOCR#usage +RUN mkdir -p /root/.EasyOCR/model && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip && \ + unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/latin.zip && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip && \ + unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/english.zip && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip && \ + unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/craft_mlt_25k.zip && \ + /tmp/clean-layer.sh + +# Tesseract and some associated utility packages +RUN apt-get install tesseract-ocr -y && \ + /tmp/clean-layer.sh + +ENV TESSERACT_PATH=/usr/bin/tesseract \ + # For Facets, we also include an empty path to include $PWD. 
+ PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Theano with MKL + MKL_THREADING_LAYER=GNU + +# Temporary fixes and patches +# Stop jupyter nbconvert trying to rewrite its folder hierarchy +RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ + mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ + # Make matplotlib output in Jupyter notebooks display correctly + mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ + /tmp/clean-layer.sh + +# install imagemagick for wand +# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu +RUN apt-get install libmagickwand-dev && \ + /tmp/clean-layer.sh + +# Override default imagemagick policies +ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml + +# Add Kaggle module resolver +ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py +RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \ + sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py + +# Add BigQuery client proxy settings +ENV PYTHONUSERBASE="/root/.local" +ADD patches/kaggle_gcp.py \ + patches/kaggle_secrets.py \ + patches/kaggle_session.py \ + patches/kaggle_web_client.py \ + patches/kaggle_datasets.py \ + $PACKAGE_PATH/ + +# Figure out why this is in a different place? +# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it. +ADD patches/sitecustomize.py /usr/lib/python3.12/sitecustomize.py + +ARG GIT_COMMIT=unknown \ + BUILD_DATE=unknown + +LABEL git-commit=$GIT_COMMIT \ + build-date=$BUILD_DATE + +ENV GIT_COMMIT=${GIT_COMMIT} \ + BUILD_DATE=${BUILD_DATE} + +# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. +RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date + +{{ if eq .Accelerator "gpu" }} +# Add the CUDA home. +ENV CUDA_HOME=/usr/local/cuda +{{ end }} +ENTRYPOINT ["/usr/bin/env"] diff --git a/Jenkinsfile b/Jenkinsfile index 74b26dd9..c4af03e6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,4 +1,4 @@ -String cron_string = BRANCH_NAME == "master" ? "H 12 * * 1-5" : "" // Mon-Fri at noon UTC, 8am EST, 5am PDT +String cron_string = BRANCH_NAME == "main" ? 
"H 12 * * 1-5" : "" // Mon-Fri at noon UTC, 8am EST, 5am PDT pipeline { agent { label 'ephemeral-linux' } @@ -14,103 +14,159 @@ pipeline { GIT_COMMIT_SUBJECT = sh(returnStdout: true, script:"git log --format=%s -n 1 HEAD").trim() GIT_COMMIT_AUTHOR = sh(returnStdout: true, script:"git log --format='%an' -n 1 HEAD").trim() GIT_COMMIT_SUMMARY = "`` ${GIT_COMMIT_SUBJECT} - ${GIT_COMMIT_AUTHOR}" - SLACK_CHANNEL = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"master\" ]]; then echo \"#kernelops\"; else echo \"#builds\"; fi").trim() - PRETEST_TAG = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"master\" ]]; then echo \"ci-pretest\"; else echo \"${GIT_BRANCH}-pretest\"; fi").trim() - STAGING_TAG = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"master\" ]]; then echo \"staging\"; else echo \"${GIT_BRANCH}-staging\"; fi").trim() + MATTERMOST_CHANNEL = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"main\" ]]; then echo \"#kernelops\"; else echo \"#builds\"; fi").trim() + // Use dev branch names as tags, but replace '/' with '-' using sed since docker images don't support forward slash + PRETEST_TAG = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"main\" ]]; then echo \"ci-pretest\"; else echo \"${GIT_BRANCH}-pretest\" | sed 's/\\//-/g'; fi").trim() + STAGING_TAG = sh(returnStdout: true, script: "if [[ \"${GIT_BRANCH}\" == \"main\" ]]; then echo \"staging\"; else echo \"${GIT_BRANCH}-staging\" | sed 's/\\//-/g'; fi").trim() } stages { - stage('Docker CPU Build') { - options { - timeout(time: 120, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + stage('Build/Test/Diff') { + parallel { + stage('CPU') { + stages { + stage('Build CPU Image') { + options { + timeout(time: 120, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - ./build | ts - ./push ${PRETEST_TAG} - ''' - } - } + ./build | ts + ./push ${PRETEST_TAG} + ''' + } + } + stage('Diff CPU image') { + steps { + sh '''#!/bin/bash + set -exo pipefail - stage('Test CPU Image') { - options { - timeout(time: 5, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } + } + } + stage('GPU') { + agent { label 'ephemeral-linux-gpu' } + stages { + stage('Build GPU Image') { + options { + timeout(time: 4324, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + # Remove images (dangling or not) created more than 72h (3 days ago) to prevent the GPU agent disk from filling up. + # Note: CPU agents are ephemeral and do not need to have their disk cleaned up. + docker image prune --all --force --filter "until=72h" --filter "label=kaggle-lang=python" + # Remove any dangling images (no tags). + # All builds for the same branch uses the same tag. This means a subsequent build for the same branch + # will untag the previously built image which is safe to do. Builds for a single branch are performed + # serially. + docker image prune -f + + ./build --gpu | ts + ./push --gpu ${PRETEST_TAG} + ''' + } + } + stage('Diff GPU Image') { + steps { + sh '''#!/bin/bash + set -exo pipefail - date - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } - } - - stage('Docker GPU Build') { - // A GPU is not required to build this image. 
However, in our current setup, - // the default runtime is set to nvidia (as opposed to runc) and there - // is no option to specify a runtime for the `docker build` command. - // - // TODO(rosbo) don't set `nvidia` as the default runtime and use the - // `--runtime=nvidia` flag for the `docker run` command when GPU support is needed. - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. - docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" - # Remove any dangling images (no tags). - # All builds for the same branch uses the same tag. This means a subsequent build for the same branch - # will untag the previously built image which is safe to do. Builds for a single branch are performed - # serially. - docker image prune -f - ./build --gpu --base-image-tag ${PRETEST_TAG} | ts - ./push --gpu ${PRETEST_TAG} - ''' - } - } + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + } + stage('TPU VM') { + agent { label 'ephemeral-linux' } + stages { + stage('Build TPU VM Image') { + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - stage('Test GPU Image') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 20, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + ./tpu/build | ts + ./push --tpu ${PRETEST_TAG} + ''' + } + } + stage('Diff TPU VM Image') { + steps { + sh '''#!/bin/bash + set -exo pipefail - date - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' + docker pull gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} + ./diff --tpu --target gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} + ''' + } + } + } + } } } - stage('Package Versions') { + stage('Test') { parallel { - stage('CPU Diff') { + stage('Test CPU Image') { + options { + timeout(time: 15, unit: 'MINUTES') + } steps { - sh '''#!/bin/bash - set -exo pipefail + retry(2) { + sh '''#!/bin/bash + set -exo pipefail - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } } } - stage('GPU Diff') { + stage('Test on P100') { agent { label 'ephemeral-linux-gpu' } + options { + timeout(time: 40, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on T4x2') { + agent { label 'ephemeral-linux-gpu-t4x2' } + options { + timeout(time: 60, unit: 'MINUTES') + } steps { - sh '''#!/bin/bash - set -exo pipefail + retry(2) { + sh '''#!/bin/bash + set -exo pipefail - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } } } } @@ -123,6 +179,7 @@ pipeline { gcloud container images add-tag gcr.io/kaggle-images/python:${PRETEST_TAG} 
gcr.io/kaggle-images/python:${STAGING_TAG} gcloud container images add-tag gcr.io/kaggle-private-byod/python:${PRETEST_TAG} gcr.io/kaggle-private-byod/python:${STAGING_TAG} + # NOTE(b/336842777): TPUVM images are tested on an actual TPU VM outside this pipeline, so they are not auto-promoted to :staging tag. ''' } } @@ -130,16 +187,13 @@ pipeline { post { failure { - slackSend color: 'danger', message: "*<${env.BUILD_URL}console|${JOB_NAME} failed>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.SLACK_CHANNEL - mattermostSend color: 'danger', message: "*<${env.BUILD_URL}console|${JOB_NAME} failed>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.SLACK_CHANNEL + mattermostSend color: 'danger', message: "*<${env.BUILD_URL}console|${JOB_NAME} failed>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: env.MATTERMOST_CHANNEL } success { - slackSend color: 'good', message: "*<${env.BUILD_URL}console|${JOB_NAME} passed>* ${GIT_COMMIT_SUMMARY}", channel: env.SLACK_CHANNEL - mattermostSend color: 'good', message: "*<${env.BUILD_URL}console|${JOB_NAME} passed>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.SLACK_CHANNEL + mattermostSend color: 'good', message: "*<${env.BUILD_URL}console|${JOB_NAME} passed>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: env.MATTERMOST_CHANNEL } aborted { - slackSend color: 'warning', message: "*<${env.BUILD_URL}console|${JOB_NAME} aborted>* ${GIT_COMMIT_SUMMARY}", channel: env.SLACK_CHANNEL - mattermostSend color: 'warning', message: "*<${env.BUILD_URL}console|${JOB_NAME} aborted>* ${GIT_COMMIT_SUMMARY} @kernels-backend-ops", channel: env.SLACK_CHANNEL + mattermostSend color: 'warning', message: "*<${env.BUILD_URL}console|${JOB_NAME} aborted>* ${GIT_COMMIT_SUMMARY} @dockerops", channel: env.MATTERMOST_CHANNEL } } } diff --git a/README.md b/README.md index 44be6c06..315e7db2 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,13 @@ [Kaggle Notebooks](https://www.kaggle.com/notebooks) allow users to run a Python Notebook in the cloud against our competitions and datasets without having to download data or set up their environment. -This repository includes our Dockerfiles for building the [CPU-only](Dockerfile) and [GPU](gpu.Dockerfile) image that runs Python Notebooks on Kaggle. +This repository includes the [Dockerfile](Dockerfile.tmpl) for building the CPU-only and GPU image that runs Python Notebooks on Kaggle. -Our Python Docker images are stored on Google Container Registry at: +Our Python Docker images are stored on the Google Container Registry at: * CPU-only: [gcr.io/kaggle-images/python](https://gcr.io/kaggle-images/python) * GPU: [gcr.io/kaggle-gpu-images/python](https://gcr.io/kaggle-gpu-images/python) -Note: The base image for the GPU image is our CPU-only image. The [gpu.Dockerfile](gpu.Dockerfile) adds a few extra layers to install GPU related libraries and packages (cuda, libcudnn, pycuda etc.) and reinstall packages with specific GPU builds (torch, tensorflow and a few mores). - -## Getting started - -To get started with this image, read our [guide](https://medium.com/@kaggleteam/how-to-get-started-with-data-science-in-containers-6ed48cb08266) to using it yourself, or browse [Kaggle Notebooks](https://www.kaggle.com/notebooks) for ideas. - ## Requesting new packages First, evaluate whether installing the package yourself in your own notebooks suits your needs. See [guide](https://github.com/Kaggle/docker-python/wiki/Missing-Packages). 
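As a quick illustration of that first step, a package can usually be tried out directly from a notebook cell before filing a request; a minimal sketch (the package name `emoji` is only an example, and `!pip install emoji` is the usual notebook spelling):

```python
# Hypothetical pre-check before requesting a new image package: install it
# into the running kernel and confirm it imports. "emoji" is only an example.
import importlib
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "--quiet", "emoji"],
    check=True,
)
print(importlib.import_module("emoji").__version__)
```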
@@ -23,11 +17,9 @@ If you the first step above doesn't work for your use case, [open an issue](http ## Opening a pull request -1. Update the *Dockerfile* - 1. For changes specific to the GPU image, update the [gpu.Dockerfile](gpu.Dockerfile). - 1. Otherwise, update the [Dockerfile](Dockerfile). +1. Edit [kaggle_requirements.txt](kaggle_requirements.txt). 1. Follow the instructions below to build a new image. -1. Add tests for your new package. See this [example](https://github.com/Kaggle/docker-python/blob/master/tests/test_fastai.py). +1. Add tests for your new package. See this [example](https://github.com/Kaggle/docker-python/blob/main/tests/test_fastai.py). 1. Follow the instructions below to test the new image. 1. Open a PR on this repo and you are all set! @@ -53,6 +45,8 @@ A suite of tests can be found under the `/tests` folder. You can run the test us Flags: * `--gpu` to test the GPU image. +* `--pattern test_keras.py` or `-p test_keras.py` to run a single test +* `--image gcr.io/kaggle-images/python:ci-pretest` or `-i gcr.io/kaggle-images/python:ci-pretest` to test against a specific image ## Running the image @@ -75,16 +69,3 @@ docker run --runtime nvidia --rm -it gcr.io/kaggle-gpu-images/python /bin/bash ``` To ensure your container can access the GPU, follow the instructions posted [here](https://github.com/Kaggle/docker-python/issues/361#issuecomment-448093930). - -## Tensorflow custom pre-built wheel - -A Tensorflow custom pre-built wheel is used mainly for: - -* Faster build time: Building tensorflow from sources takes ~1h. Keeping this process outside the main build allows faster iterations when working on our Dockerfiles. - -Building Tensorflow from sources: - -* Increase performance: When building from sources, we can leverage CPU specific optimizations -* Is required: Tensorflow with GPU support must be built from sources - -The [Dockerfile](tensorflow-whl/Dockerfile) and the [instructions](tensorflow-whl/README.md) can be found in the [tensorflow-whl folder/](tensorflow-whl/). diff --git a/build b/build index ae9a9779..83bbe577 100755 --- a/build +++ b/build @@ -9,12 +9,12 @@ Build a new Python Docker image. Options: -g, --gpu Build an image with GPU support. -c, --use-cache Use layer cache when building a new image. - -b, --base-image-tag TAG Base image tag. Defaults to value defined in DOCKERFILE. EOF } CACHE_FLAG='--no-cache' DOCKERFILE='Dockerfile' +ACCELERATOR='none' IMAGE_TAG='kaggle/python-build' BUILD_ARGS='' @@ -27,19 +27,11 @@ while :; do -g|--gpu) IMAGE_TAG='kaggle/python-gpu-build' DOCKERFILE='gpu.Dockerfile' + ACCELERATOR='gpu' ;; -c|--use-cache) CACHE_FLAG='' ;; - -b|--base-image-tag) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No TAG specified after the %s flag.\n' "$1" >&2 - exit - fi - BUILD_ARGS="--build-arg BASE_TAG=$2" - shift # skip the flag value - ;; -?*) usage printf 'ERROR: Unknown option: %s\n' "$1" >&2 @@ -57,8 +49,21 @@ BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')" readonly CACHE_FLAG readonly DOCKERFILE +readonly ACCELERATOR readonly IMAGE_TAG readonly BUILD_ARGS +SRCDIR=$(dirname "${BASH_SOURCE[0]}") +DOCKERFILE_OUTDIR="${SRCDIR}/.generated" +mkdir -p $DOCKERFILE_OUTDIR +DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE" + +# Generate Dockerfile from template. +echo "Generating Dockerfile from template..." 
+docker run --rm -v $PWD:/input:ro gcr.io/kaggle-images/go-renderizer:latest --ACCELERATOR=$ACCELERATOR /input/Dockerfile.tmpl > $DOCKERFILE_PATH +echo "==================== $DOCKERFILE START ====================" +cat $DOCKERFILE_PATH +echo "==================== $DOCKERFILE END ====================" + set -x -docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE" $BUILD_ARGS . +docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" $BUILD_ARGS . diff --git a/clean-layer.sh b/clean-layer.sh index d1a048fc..9a50e7bf 100755 --- a/clean-layer.sh +++ b/clean-layer.sh @@ -10,8 +10,6 @@ set -e set -x -# Delete files that pip caches when installing a package. -rm -rf /root/.cache/pip/* # Delete old downloaded archive files apt-get autoremove -y # Delete downloaded archive files @@ -19,6 +17,4 @@ apt-get clean # Ensures the current working directory won't be deleted cd /usr/local/src/ # Delete source files used for building binaries -rm -rf /usr/local/src/* -# Delete conda downloaded tarballs -conda clean -y --tarballs +rm -rf /usr/local/src/* \ No newline at end of file diff --git a/diff b/diff index cd0fc732..c8251703 100755 --- a/diff +++ b/diff @@ -32,6 +32,10 @@ while :; do BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python:latest' TARGET_IMAGE_TAG='kaggle/python-gpu-build' ;; + -x|--tpu) + BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python-tpuvm:latest' + TARGET_IMAGE_TAG='kaggle/python-tpuvm-build' + ;; -b|--base) if [[ -z "$2" ]]; then usage @@ -94,13 +98,13 @@ if [[ -n "$PACKAGE_NAME" ]]; then echo "Package: $PACKAGE_NAME" CMDS=("python /tools/pip_list_versions.py $PACKAGE_NAME | sort") else - CMDS=("pip freeze" 'cat /etc/os-release | grep -oP "PRETTY_NAME=\"\K([^\"]*)"' "uname -r" "dpkg --list | awk '{print \$2\"==\"\$3}'") + CMDS=("pip list --format=freeze" 'cat /etc/os-release | grep -oP "PRETTY_NAME=\"\K([^\"]*)"' "uname -r" "dpkg --list | awk '{print \$2\"==\"\$3}'" "printenv | sort") fi for cmd in "${CMDS[@]}"; do echo "== Comparing $cmd ==" diff --suppress-common-lines --side-by-side \ - <(docker run -v $PWD/tools:/tools --rm "$BASE_IMAGE_TAG" /bin/bash -c "$cmd") \ - <(docker run -v $PWD/tools:/tools --rm "$TARGET_IMAGE_TAG" /bin/bash -c "$cmd") \ + <(docker run -v $PWD/tools:/tools --entrypoint bash --rm "$BASE_IMAGE_TAG" -c "$cmd") \ + <(docker run -v $PWD/tools:/tools --entrypoint bash --rm "$TARGET_IMAGE_TAG" -c "$cmd") \ && echo 'No diff' || true done diff --git a/gpu.Dockerfile b/gpu.Dockerfile deleted file mode 100644 index b40d0807..00000000 --- a/gpu.Dockerfile +++ /dev/null @@ -1,111 +0,0 @@ -ARG BASE_TAG=staging - -FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 AS nvidia -FROM gcr.io/kaggle-images/python-tensorflow-whl:2.4.1-py37-2 as tensorflow_whl -FROM gcr.io/kaggle-images/python:${BASE_TAG} - -ADD clean-layer.sh /tmp/clean-layer.sh - -# Cuda support -COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg -# See b/142337634#comment28 -RUN sed -i 's/deb https:\/\/developer.download.nvidia.com/deb http:\/\/developer.download.nvidia.com/' /etc/apt/sources.list.d/*.list - -# Ensure the cuda libraries are compatible with the custom Tensorflow wheels. -# TODO(b/120050292): Use templating to keep in sync or COPY installed binaries from it. 
-ENV CUDA_MAJOR_VERSION=11 -ENV CUDA_MINOR_VERSION=0 -ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/opt/bin:${PATH} -# The stub is useful to us both for built-time linking and run-time linking, on CPU-only systems. -# When intended to be used with actual GPUs, make sure to (besides providing access to the host -# CUDA user libraries, either manually or through the use of nvidia-docker) exclude them. One -# convenient way to do so is to obscure its contents by a bind mount: -# docker run .... -v /non-existing-directory:/usr/local/cuda/lib64/stubs:ro ... -ENV LD_LIBRARY_PATH_NO_STUBS="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" -ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" -ENV NVIDIA_VISIBLE_DEVICES=all -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility -ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION" -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cupti-$CUDA_VERSION \ - cuda-cudart-$CUDA_VERSION \ - cuda-cudart-dev-$CUDA_VERSION \ - cuda-libraries-$CUDA_VERSION \ - cuda-libraries-dev-$CUDA_VERSION \ - cuda-nvml-dev-$CUDA_VERSION \ - cuda-minimal-build-$CUDA_VERSION \ - cuda-command-line-tools-$CUDA_VERSION \ - libcudnn8=8.0.4.30-1+cuda$CUDA_VERSION \ - libcudnn8-dev=8.0.4.30-1+cuda$CUDA_VERSION \ - libnccl2=2.7.8-1+cuda$CUDA_VERSION \ - libnccl-dev=2.7.8-1+cuda$CUDA_VERSION && \ - ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda && \ - ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ - /tmp/clean-layer.sh - -# Install OpenCL & libboost (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - /tmp/clean-layer.sh - -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -# However, because this image is based on the CPU image, this isn't possible but better -# to put them at the top of this file to minize conflicts. -RUN conda remove --force -y pytorch torchvision torchaudio cpuonly && \ - conda install "pytorch=1.7" "torchvision=0.8" "torchaudio=0.7" "torchtext=0.8" "cudf=0.16" "cuml=0.16" cudatoolkit=$CUDA_VERSION && \ - /tmp/clean-layer.sh - -# Install LightGBM with GPU -RUN pip uninstall -y lightgbm && \ - cd /usr/local/src && \ - git clone --recursive https://github.com/microsoft/LightGBM && \ - cd LightGBM && \ - git checkout tags/v3.2.0 && \ - mkdir build && cd build && \ - cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \ - make -j$(nproc) && \ - cd /usr/local/src/LightGBM/python-package && \ - python setup.py install --precompile && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - /tmp/clean-layer.sh - -# Install JAX (Keep JAX version in sync with CPU image) -RUN pip install jax==0.2.12 jaxlib==0.1.64+cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://storage.googleapis.com/jax-releases/jax_releases.html && \ - /tmp/clean-layer.sh - -# Reinstall packages with a separate version for GPU support. 
-COPY --from=tensorflow_whl /tmp/tensorflow_gpu/*.whl /tmp/tensorflow_gpu/ -RUN pip uninstall -y tensorflow && \ - pip install /tmp/tensorflow_gpu/tensorflow*.whl && \ - rm -rf /tmp/tensorflow_gpu && \ - pip uninstall -y mxnet && \ - pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh - - # Reinstall TensorFlow addons (TFA) with GPU support. -COPY --from=tensorflow_whl /tmp/tfa_gpu/*.whl /tmp/tfa_gpu/ -RUN pip install /tmp/tfa_gpu/tensorflow*.whl && \ - rm -rf /tmp/tfa_gpu/ && \ - /tmp/clean-layer.sh - -# Install GPU-only packages -RUN pip install pycuda && \ - pip install cupy-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ - pip install pynvrtc && \ - pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh - -# Re-add TensorBoard Jupyter extension patch -# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py - -# Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt new file mode 100644 index 00000000..30e0683f --- /dev/null +++ b/kaggle_requirements.txt @@ -0,0 +1,128 @@ +# Please keep this in alphabetical order +Boruta +Cartopy +ImageHash +Janome +PyArabic +PyUpSet +Pympler +Rtree +shapely +SimpleITK +TPOT +Wand +bayesian-optimization +boto3 +catboost +category-encoders +cesium +comm +cytoolz +# Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" +# https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor +datasets>=2.14.6 +deap +dipy +docker +easyocr +emoji +fasttext +featuretools +fiona +fury +fuzzywuzzy +geojson +gensim +# b/443054743,b/455550872 +google-adk[a2a,eval]>=1.21.0 +google-cloud-aiplatform +google-cloud-videointelligence +google-cloud-vision +google-genai +gpxpy +h2o +haversine +hep-ml +igraph +ipympl +ipywidgets==8.1.5 +isoweek +jedi +# jitler 0.11.1 breaks simulation image +jiter==0.10.0 +# b/276358430: fix Jupyter lsp freezing up the jupyter server +jupyter-lsp==1.5.1 +# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) +jupyter_server==2.12.5 +jupyter_server_proxy +jupyterlab +jupyterlab-lsp +kaggle>=1.8.3 +kaggle-environments +kagglehub[pandas-datasets,hf-datasets,signing]>=0.4.2 +keras-cv +keras-nlp +keras-tuner +kornia +langid +libpysal +lime +line_profiler +mamba +matplotlib +mlcrate +mne +mpld3 +# b/274619697: learntools requires a specific nbconvert right now +nbconvert==6.4.5 +nbdev +nilearn +olefile +onnx +openslide-bin +openslide-python +optuna +pandas-profiling +pandasql +papermill +path +path.py +pdf2image +plotly-express +pudb +pyLDAvis +pycryptodome +pydicom +pyemd +pyexcel-ods +pymongo +pypdf +pytesseract +python-lsp-server +pytorch-ignite +pytorch-lightning +qgrid +qtconsole +ray +rgf-python +s3fs +scikit-learn +# Scikit-learn accelerated library for x86 +scikit-learn-intelex>=2023.0.1 +scikit-multilearn +scikit-optimize +scikit-plot +scikit-surprise +git+https://github.com/facebookresearch/segment-anything.git +squarify +tensorflow-io +# Must be compatible with torch version: https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec +torchcodec==0.9 +torchinfo +torchmetrics +torchtune +transformers>=5.0.0 +vtk +wavio +xvfbwrapper +ydata-profiling diff --git a/patches/kaggle_datasets.py 
b/patches/kaggle_datasets.py index f5134673..e60db2ef 100644 --- a/patches/kaggle_datasets.py +++ b/patches/kaggle_datasets.py @@ -1,7 +1,12 @@ import os +import sys +from os import listdir +from os.path import isdir, join from kaggle_web_client import KaggleWebClient _KAGGLE_TPU_NAME_ENV_VAR_NAME = 'TPU_NAME' +_KAGGLE_TPUVM_NAME_ENV_VAR_NAME = 'ISTPUVM' +_KAGGLE_INPUT_DIR = '/kaggle/input' class KaggleDatasets: GET_GCS_PATH_ENDPOINT = '/requests/CopyDatasetVersionToKnownGcsBucketRequest' @@ -14,8 +19,20 @@ class KaggleDatasets: def __init__(self): self.web_client = KaggleWebClient() self.has_tpu = os.getenv(_KAGGLE_TPU_NAME_ENV_VAR_NAME) is not None + self.has_tpuvm = os.getenv(_KAGGLE_TPUVM_NAME_ENV_VAR_NAME) is not None def get_gcs_path(self, dataset_dir: str = None) -> str: + if self.has_tpuvm: + if dataset_dir is None: + onlydirs = [f for f in listdir(_KAGGLE_INPUT_DIR) if isdir(join(_KAGGLE_INPUT_DIR, f))] + if len(onlydirs) == 1: + dataset_dir = onlydirs[0] + else: + raise Exception("Could not infer dataset_dir. dataset_dir can only be inferred if there is exactly 1 Kaggle dataset attached.") + dataset = join(_KAGGLE_INPUT_DIR, dataset_dir) + print("get_gcs_path is not required on TPU VMs which can directly use Kaggle datasets, using path: " + dataset, file=sys.stderr) + return dataset + integration_type = self.TPU if self.has_tpu else self.AUTO_ML data = { 'MountSlug': dataset_dir, diff --git a/patches/kaggle_gcp.py b/patches/kaggle_gcp.py index 07179fa9..4cb98858 100644 --- a/patches/kaggle_gcp.py +++ b/patches/kaggle_gcp.py @@ -1,5 +1,6 @@ import os import inspect +import logging from google.auth import credentials, environment_vars from google.auth.exceptions import RefreshError from google.api_core.gapic_v1.client_info import ClientInfo @@ -8,8 +9,6 @@ from google.cloud.bigquery._http import Connection from kaggle_secrets import GcpTarget, UserSecretsClient -from log import Log - KAGGLE_GCP_CLIENT_USER_AGENT="kaggle-gcp-client/1.0" def get_integrations(): @@ -22,7 +21,7 @@ def get_integrations(): target = GcpTarget[integration.upper()] kernel_integrations.add_integration(target) except KeyError as e: - Log.error(f"Unknown integration target: {integration.upper()}") + logging.debug(f"Unknown integration target: {integration.upper()}") return kernel_integrations @@ -66,14 +65,14 @@ def refresh(self, request): elif self.target == GcpTarget.CLOUDAI: self.token, self.expiry = client._get_cloudai_access_token() except ConnectionError as e: - Log.error(f"Connection error trying to refresh access token: {e}") + logging.error(f"Connection error trying to refresh access token: {e}") print("There was a connection error trying to fetch the access token. " f"Please ensure internet is on in order to use the {self.target.service} Integration.") raise RefreshError('Unable to refresh access token due to connection error.') from e except Exception as e: - Log.error(f"Error trying to refresh access token: {e}") + logging.error(f"Error trying to refresh access token: {e}") if (not get_integrations().has_integration(self.target)): - Log.error(f"No {self.target.service} integration found.") + logging.error(f"No {self.target.service} integration found.") print( f"Please ensure you have selected a {self.target.service} account in the Notebook Add-ons menu.") raise RefreshError('Unable to refresh access token.') from e @@ -102,7 +101,7 @@ def api_request(self, *args, **kwargs): msg = ("Permission denied using Kaggle's public BigQuery integration. 
" "Did you mean to select a BigQuery account in the Notebook Add-ons menu?") print(msg) - Log.info(msg) + logging.info(msg) raise e @@ -156,23 +155,23 @@ def monkeypatch_bq(bq_client, *args, **kwargs): # Remove these two lines once this is resolved: # https://github.com/googleapis/google-cloud-python/issues/8108 if explicit_project_id: - Log.info(f"Explicit project set to {explicit_project_id}") + logging.info(f"Explicit project set to {explicit_project_id}") kwargs['project'] = explicit_project_id if explicit_project_id is None and specified_credentials is None and not has_bigquery: msg = "Using Kaggle's public dataset BigQuery integration." - Log.info(msg) + logging.info(msg) print(msg) return PublicBigqueryClient(*args, **kwargs) else: if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = KaggleKernelCredentials() if (not has_bigquery): - Log.info("No bigquery integration found, creating client anyways.") + logging.info("No bigquery integration found, creating client anyways.") print('Please ensure you have selected a BigQuery ' 'account in the Notebook Add-ons menu.') if explicit_project_id is None: - Log.info("No project specified while using the unmodified client.") + logging.info("No project specified while using the unmodified client.") print('Please ensure you specify a project id when creating the client' ' in order to use your BigQuery account.') kwargs['client_info'] = set_kaggle_user_agent(kwargs.get('client_info')) @@ -187,29 +186,29 @@ def monkeypatch_bq(bq_client, *args, **kwargs): bq_client, *args, **kwargs) return bigquery -# Monkey patch classes that use the init method +# Monkey patch for aiplatform init # eg # from google.cloud import aiplatform # aiplatform.init(args) -def monkeypatch_init(client_klass, kaggle_kernel_credentials): - client_init = client_klass.init - def patched_init(self, *args, **kwargs): +def monkeypatch_aiplatform_init(aiplatform_klass, kaggle_kernel_credentials): + aiplatform_init = aiplatform_klass.init + def patched_init(*args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = kaggle_kernel_credentials - return client_init(self, *args, **kwargs) + return aiplatform_init(*args, **kwargs) - if (not has_been_monkeypatched(client_klass.init)): - client_klass.init = patched_init - Log.info(f"Client patched: {client_klass}") + if (not has_been_monkeypatched(aiplatform_klass.init)): + aiplatform_klass.init = patched_init + logging.info("aiplatform.init patched") def monkeypatch_client(client_klass, kaggle_kernel_credentials): client_init = client_klass.__init__ def patched_init(self, *args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") # Some GCP services demand the billing and target project must be the same. 
# To avoid using default service account based credential as caller credential # user need to provide ClientOptions with quota_project_id: @@ -227,7 +226,7 @@ def patched_init(self, *args, **kwargs): if (not has_been_monkeypatched(client_klass.__init__)): client_klass.__init__ = patched_init - Log.info(f"Client patched: {client_klass}") + logging.info(f"Client patched: {client_klass}") def set_kaggle_user_agent(client_info: ClientInfo): # Add kaggle client user agent in order to attribute usage. @@ -253,37 +252,6 @@ def init_gcs(): KaggleKernelCredentials(target=GcpTarget.GCS)) return storage -def init_automl(): - from google.cloud import automl, automl_v1beta1 - if not is_user_secrets_token_set(): - return - - from kaggle_gcp import get_integrations - if not get_integrations().has_cloudai(): - return - - from kaggle_secrets import GcpTarget - from kaggle_gcp import KaggleKernelCredentials - kaggle_kernel_credentials = KaggleKernelCredentials(target=GcpTarget.CLOUDAI) - - # Patch the 2 GA clients: AutoMlClient and PreditionServiceClient - monkeypatch_client(automl.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl.PredictionServiceClient, kaggle_kernel_credentials) - - # The AutoML client library exposes 3 different client classes (AutoMlClient, - # TablesClient, PredictionServiceClient), so patch each of them. - # The same KaggleKernelCredentials are passed to all of them. - # The GcsClient class is only used internally by TablesClient. - - # The beta version of the clients that are now GA are included here for now. - # They are deprecated and will be removed by 1 May 2020. - monkeypatch_client(automl_v1beta1.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl_v1beta1.PredictionServiceClient, kaggle_kernel_credentials) - - # The TablesClient is still in beta, so this will not be deprecated until - # the TablesClient is GA. - monkeypatch_client(automl_v1beta1.TablesClient, kaggle_kernel_credentials) - def init_translation_v2(): from google.cloud import translate_v2 if not is_user_secrets_token_set(): @@ -340,7 +308,7 @@ def init_ucaip(): kaggle_kernel_credentials = KaggleKernelCredentials(target=GcpTarget.CLOUDAI) # Patch the ucaip init method, this flows down to all ucaip services - monkeypatch_init(aiplatform.initializer.global_config, kaggle_kernel_credentials) + monkeypatch_aiplatform_init(aiplatform, kaggle_kernel_credentials) def init_video_intelligence(): from google.cloud import videointelligence @@ -379,7 +347,6 @@ def init_vision(): def init(): init_bigquery() init_gcs() - init_automl() init_translation_v2() init_translation_v3() init_natural_language() @@ -392,4 +359,4 @@ def init(): # google.cloud.* and kaggle_gcp. By calling init here, we guarantee # that regardless of the original import that caused google.cloud.* to be # loaded, the monkeypatching will be done. 
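For readers unfamiliar with the credential-injection pattern that the hunks above apply (and that `monkeypatch_client` / `monkeypatch_aiplatform_init` implement), here is a minimal, self-contained sketch of the same idea. `FakeClient`, `DummyCredentials`, and `monkeypatch_fake_client` are illustrative names only and are not part of this patch.

```python
# Minimal sketch of the credential-injection pattern used by monkeypatch_client().
# FakeClient and DummyCredentials are illustrative stand-ins, not part of the patch.
class DummyCredentials:
    pass

class FakeClient:
    def __init__(self, credentials=None):
        self.credentials = credentials

def monkeypatch_fake_client(client_klass, default_credentials):
    original_init = client_klass.__init__

    def patched_init(self, *args, **kwargs):
        # Only inject the default credentials when the caller did not pass any.
        if kwargs.get('credentials') is None:
            kwargs['credentials'] = default_credentials
        return original_init(self, *args, **kwargs)

    # Guard against double-patching, mirroring has_been_monkeypatched().
    if not getattr(client_klass.__init__, '_is_patched', False):
        patched_init._is_patched = True
        client_klass.__init__ = patched_init

monkeypatch_fake_client(FakeClient, DummyCredentials())
assert isinstance(FakeClient().credentials, DummyCredentials)        # injected
assert FakeClient(credentials='explicit').credentials == 'explicit'  # left untouched
```

As in the real `patched_init` above, explicitly supplied credentials are never overridden, and the guard keeps the wrapper from being applied twice.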
-init()
+init()
\ No newline at end of file
diff --git a/patches/kaggle_module_resolver.py b/patches/kaggle_module_resolver.py
new file mode 100644
index 00000000..430cb980
--- /dev/null
+++ b/patches/kaggle_module_resolver.py
@@ -0,0 +1,22 @@
+import os
+import re
+import kagglehub
+
+from tensorflow_hub import resolver
+
+short_url_pattern = re.compile(r"https?://([a-z]+\.)?kaggle.com/models/(?P<owner>[^\\/]+)/(?P<model>[^\\/]+)/(?P<framework>[^\\/]+)/(?P<variation>[^\\/]+)/(?P<version>[0-9]+)$")
+long_url_pattern = re.compile(r"https?://([a-z]+\.)?kaggle.com/models/(?P<owner>[^\\/]+)/(?P<model>[^\\/]+)/frameworks/(?P<framework>[^\\/]+)/variations/(?P<variation>[^\\/]+)/versions/(?P<version>[0-9]+)$")
+
+def _is_on_kaggle_notebook():
+    return os.getenv("KAGGLE_KERNEL_RUN_TYPE") != None and os.getenv("KAGGLE_USER_SECRETS_TOKEN") != None
+
+def _is_kaggle_handle(handle):
+    return long_url_pattern.match(handle) != None or short_url_pattern.match(handle) != None
+
+class KaggleFileResolver(resolver.HttpResolverBase):
+    def is_supported(self, handle):
+        return _is_on_kaggle_notebook() and _is_kaggle_handle(handle)
+
+    def __call__(self, handle):
+        m = long_url_pattern.match(handle) or short_url_pattern.match(handle)
+        return kagglehub.model_download(f"{m.group('owner')}/{m.group('model')}/{m.group('framework').lower()}/{m.group('variation')}/{m.group('version')}")
diff --git a/patches/kaggle_secrets.py b/patches/kaggle_secrets.py
index c1c5dc99..a177c171 100644
--- a/patches/kaggle_secrets.py
+++ b/patches/kaggle_secrets.py
@@ -94,7 +94,6 @@ def set_gcloud_credentials(self, project=None, account=None):
        """
        creds = self.get_gcloud_credential()
        creds_path = self._write_credentials_file(creds)
-        self._write_gsutil_credentials_file(creds)
        subprocess.run(['gcloud', 'config', 'set', 'auth/credential_file_override', creds_path])
@@ -107,19 +106,11 @@ def set_gcloud_credentials(self, project=None, account=None):
            subprocess.run(['gcloud', 'config', 'set', 'account', account])

    def set_tensorflow_credential(self, credential):
-        """Sets the credential for use by Tensorflow both in the local notebook
-        and to pass to the TPU.
-        """
-        # b/159906185: Import tensorflow_gcs_config only when this method is called to prevent preloading TensorFlow.
-        import tensorflow_gcs_config
+        """Sets the credential for use by Tensorflow"""

-        # Write to a local JSON credentials file and set
-        # GOOGLE_APPLICATION_CREDENTIALS for tensorflow running in the notebook.
+        # Write to a local JSON credentials file
        self._write_credentials_file(credential)

-        # set the credential for the TPU
-        tensorflow_gcs_config.configure_gcs(credentials=credential)
-
    def get_bigquery_access_token(self) -> Tuple[str, Optional[datetime]]:
        """Retrieves BigQuery access token information from the UserSecrets service.
@@ -139,17 +130,6 @@ def _write_credentials_file(self, credentials) -> str:

        return adc_path

-    def _write_gsutil_credentials_file(self, credentials) -> str:
-        import json
-        creds_dict = json.loads(credentials)
-        boto_path = os.path.join(os.environ.get('HOME', '/'), '.boto')
-        with open(boto_path, 'w') as f:
-            f.write('[Credentials]\n')
-            f.write(' gs_oauth2_refresh_token = ')
-            f.write(creds_dict['refresh_token'])
-
-        return boto_path
-
    def _get_gcs_access_token(self) -> Tuple[str, Optional[datetime]]:
        return self._get_access_token(GcpTarget.GCS)
diff --git a/patches/keras_internal.py b/patches/keras_internal.py
new file mode 100644
index 00000000..e28127f9
--- /dev/null
+++ b/patches/keras_internal.py
@@ -0,0 +1,24 @@
+# Copyright 2021 Google LLC.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Access to Keras function with a different internal and external path.""" + +from tf_keras.src.engine import data_adapter as _data_adapter +from tf_keras.src.models import Functional +from tf_keras.layers import DenseFeatures +from tf_keras.src.utils.dataset_creator import DatasetCreator + + +unpack_x_y_sample_weight = _data_adapter.unpack_x_y_sample_weight +get_data_handler = _data_adapter.get_data_handler diff --git a/patches/keras_internal_test.py b/patches/keras_internal_test.py new file mode 100644 index 00000000..edc33ec2 --- /dev/null +++ b/patches/keras_internal_test.py @@ -0,0 +1,23 @@ +# Copyright 2021 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf +from tensorflow_decision_forests.keras import keras_internal + + +# Does nothing. Ensures keras_internal can be loaded. + +if __name__ == "__main__": + tf.test.main() + diff --git a/patches/log.py b/patches/log.py deleted file mode 100644 index 88040ba3..00000000 --- a/patches/log.py +++ /dev/null @@ -1,132 +0,0 @@ -import logging -import os - -import google.auth - - -_LOG_TO_FILE_ENV = os.getenv("KAGGLE_LOG_TO_FILE") - - -class _LogFormatter(logging.Formatter): - """A logging formatter which truncates long messages.""" - - _MAX_LOG_LENGTH = 10000 # Be generous, not to truncate long backtraces. - - def format(self, record): - msg = super(_LogFormatter, self).format(record) - return msg[:_LogFormatter._MAX_LOG_LENGTH] if msg else msg - -# TODO(vimota): Clean this up once we're using python 3.8 and can use -# (https://github.com/python/cpython/commit/dde9fdbe453925279ac3d2a6a72102f6f9ef247c) -# Right now, making the logging module display the intended frame's information -# when the logging calls (info, warn, ...) are wrapped (as is the case in our -# Log class) involves fragile logic. -class _Logger(logging.Logger): - - # This is a copy of logging.Logger.findCaller with the filename ignore - # set expanded to include the current filename (".../log.py"). - # Copyright 2001-2015 by Vinay Sajip. All Rights Reserved. 
- # License: https://github.com/python/cpython/blob/ce9e62544571e7ade7186697d5dd065fb4c5243f/LICENSE - def findCaller(self, stack_info=False): - f = logging.currentframe() - f = f.f_back - rv = "(unknown file)", 0, "(unknown function)", None - while hasattr(f, "f_code"): - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename in _ignore_srcfiles: - f = f.f_back - continue - sinfo = None - if stack_info: - sio = io.StringIO() - sio.write('Stack (most recent call last):\n') - traceback.print_stack(f, file=sio) - sinfo = sio.getvalue() - if sinfo[-1] == '\n': - sinfo = sinfo[:-1] - sio.close() - rv = (co.co_filename, f.f_lineno, co.co_name, sinfo) - break - return rv - - -_srcfile = os.path.normcase(_Logger.findCaller.__code__.co_filename) -_ignore_srcfiles = (_srcfile, logging._srcfile) - -class Log: - """ Helper aggregate for all things related to logging activity. """ - - _GLOBAL_LOG = logging.getLogger("") - _initialized = False - - # These are convenience helpers. For performance, consider saving Log.get_logger() and using that - @staticmethod - def critical(msg, *args, **kwargs): - Log._GLOBAL_LOG.critical(msg, *args, **kwargs) - - @staticmethod - def fatal(msg, *args, **kwargs): - Log._GLOBAL_LOG.fatal(msg, *args, **kwargs) - - @staticmethod - def exception(msg, *args, **kwargs): - Log._GLOBAL_LOG.exception(msg, *args, **kwargs) - - @staticmethod - def error(msg, *args, **kwargs): - Log._GLOBAL_LOG.error(msg, *args, **kwargs) - - @staticmethod - def warn(msg, *args, **kwargs): - Log._GLOBAL_LOG.warn(msg, *args, **kwargs) - - @staticmethod - def warning(msg, *args, **kwargs): - Log._GLOBAL_LOG.warning(msg, *args, **kwargs) - - @staticmethod - def debug(msg, *args, **kwargs): - Log._GLOBAL_LOG.debug(msg, *args, **kwargs) - - @staticmethod - def info(msg, *args, **kwargs): - Log._GLOBAL_LOG.info(msg, *args, **kwargs) - - @staticmethod - def set_level(loglevel): - if isinstance(loglevel, int): - Log._GLOBAL_LOG.setLevel(loglevel) - return - elif isinstance(loglevel, str): - # idea from https://docs.python.org/3.5/howto/logging.html#logging-to-a-file - numeric_level = getattr(logging, loglevel.upper(), None) - if isinstance(numeric_level, int): - Log._GLOBAL_LOG.setLevel(numeric_level) - return - - raise ValueError('Invalid log level: %s' % loglevel) - - @staticmethod - def _static_init(): - if Log._initialized: - return - - logging.setLoggerClass(_Logger) - # The root logger's type is unfortunately (and surprisingly) not affected by - # `setLoggerClass`. Monkey patch it instead. TODO(vimota): Remove this, see the TODO - # associated with _Logger. - logging.RootLogger.findCaller = _Logger.findCaller - log_to_file = _LOG_TO_FILE_ENV.lower() in ("yes", "true", "t", "1") if _LOG_TO_FILE_ENV is not None else True - if log_to_file: - handler = logging.FileHandler(filename='/tmp/kaggle.log', mode='w') - else: - handler = logging.StreamHandler() - - # ".1s" is for the first letter: http://stackoverflow.com/a/27453084/1869. 
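Since this custom `Log` wrapper is being deleted and the patched modules above now call the stdlib `logging` module directly, a rough stdlib-only equivalent of the defaults this file used to configure may help; it reuses the same `/tmp/kaggle.log` target and format string from the removed `_static_init`, is illustrative only, and is not part of this change (message truncation from `_LogFormatter` is not reproduced).

```python
import logging

# Rough stdlib-only stand-in for the removed Log helper's defaults:
# write to /tmp/kaggle.log with the same format string, at INFO level.
handler = logging.FileHandler(filename='/tmp/kaggle.log', mode='w')
handler.setFormatter(logging.Formatter(
    "%(asctime)s %(levelname).1s %(process)d %(filename)s:%(lineno)d] %(message)s"))
logging.basicConfig(level=logging.INFO, handlers=[handler])

logging.info("Using Kaggle's public dataset BigQuery integration.")
```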
- format_string = "%(asctime)s %(levelname).1s %(process)d %(filename)s:%(lineno)d] %(message)s" - handler.setFormatter(_LogFormatter(format_string)) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - Log._initialized = True - -Log._static_init() \ No newline at end of file diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index d01f3845..1bb8a1b6 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -1,16 +1,17 @@ +import logging import os -from log import Log - import sys +import importlib.abc import importlib import importlib.machinery +import wrapt + class GcpModuleFinder(importlib.abc.MetaPathFinder): _MODULES = [ 'google.cloud.bigquery', 'google.cloud.storage', - 'google.cloud.automl_v1beta1', 'google.cloud.translate', 'google.cloud.translate_v2', 'google.cloud.translate_v3', @@ -53,7 +54,6 @@ def create_module(self, spec): _LOADERS = { 'google.cloud.bigquery': kaggle_gcp.init_bigquery, 'google.cloud.storage': kaggle_gcp.init_gcs, - 'google.cloud.automl_v1beta1': kaggle_gcp.init_automl, 'google.cloud.translate': kaggle_gcp.init_translation_v3, 'google.cloud.translate_v2': kaggle_gcp.init_translation_v2, 'google.cloud.translate_v3': kaggle_gcp.init_translation_v3, @@ -72,3 +72,72 @@ def exec_module(self, module): if not hasattr(sys, 'frozen'): sys.meta_path.insert(0, GcpModuleFinder()) + +@wrapt.when_imported('google.generativeai') +def post_import_logic(module): + if os.getenv('KAGGLE_DISABLE_GOOGLE_GENERATIVE_AI_INTEGRATION') != None: + return + if (os.getenv('KAGGLE_DATA_PROXY_TOKEN') == None or + os.getenv('KAGGLE_USER_SECRETS_TOKEN') == None or + (os.getenv('KAGGLE_DATA_PROXY_URL') == None and + os.getenv('KAGGLE_GRPC_DATA_PROXY_URL') == None)): + return + + old_configure = module.configure + + def new_configure(*args, **kwargs): + if ('default_metadata' in kwargs): + default_metadata = kwargs['default_metadata'] + else: + default_metadata = [] + default_metadata.append(("x-kaggle-proxy-data", os.environ['KAGGLE_DATA_PROXY_TOKEN'])) + user_secrets_token = os.environ['KAGGLE_USER_SECRETS_TOKEN'] + default_metadata.append(('x-kaggle-authorization', f'Bearer {user_secrets_token}')) + kwargs['default_metadata'] = default_metadata + + if ('client_options' in kwargs): + client_options = kwargs['client_options'] + else: + client_options = {} + + if os.getenv('KAGGLE_GOOGLE_GENERATIVE_AI_USE_REST_ONLY') != None: + kwargs['transport'] = 'rest' + + if 'transport' in kwargs and kwargs['transport'] == 'rest': + client_options['api_endpoint'] = os.environ['KAGGLE_DATA_PROXY_URL'] + client_options['api_endpoint'] += '/palmapi' + else: + client_options['api_endpoint'] = os.environ['KAGGLE_GRPC_DATA_PROXY_URL'] + kwargs['client_options'] = client_options + + old_configure(*args, **kwargs) + + module.configure = new_configure + module.configure() # generativeai can use GOOGLE_API_KEY env variable, so make sure we have the other configs set + +@wrapt.when_imported('google.genai') +def post_genai_import_logic(module): + if os.getenv('KAGGLE_DISABLE_GOOGLE_GENERATIVE_AI_INTEGRATION'): + return + + if not (os.getenv('KAGGLE_DATA_PROXY_TOKEN') and + os.getenv('KAGGLE_USER_SECRETS_TOKEN') and + os.getenv('KAGGLE_DATA_PROXY_URL')): + return + @wrapt.patch_function_wrapper(module, 'Client.__init__') + def init_wrapper(wrapped, instance, args, kwargs): + # Don't want to forward requests that are to Vertex AI, debug mode, or have their own http_options specified + # Thus, if the client constructor contains any params other than api_key, we don't set up 
forwarding + if any(value is not None for key, value in kwargs.items() if key != 'api_key'): + return wrapped(*args, **kwargs) + + default_metadata = { + "x-kaggle-proxy-data": os.environ['KAGGLE_DATA_PROXY_TOKEN'], + 'x-kaggle-authorization': f"Bearer {os.environ['KAGGLE_USER_SECRETS_TOKEN']}" + } + http_options = { + 'base_url': os.getenv('KAGGLE_DATA_PROXY_URL') + '/palmapi/', + 'headers': default_metadata + } + kwargs['http_options'] = http_options + return wrapped(*args, **kwargs) diff --git a/push b/push index cdd53389..124a3469 100755 --- a/push +++ b/push @@ -8,6 +8,7 @@ Push a newly-built image with the given LABEL to gcr.io and DockerHub. Options: -g, --gpu Push the image with GPU support. + -t, --tpu Push the image with GPU support. -s, --source-image IMAGE Tag for the source image. EOF } @@ -26,6 +27,10 @@ while :; do SOURCE_IMAGE_TAG='kaggle/python-gpu-build:latest' TARGET_IMAGE='gcr.io/kaggle-private-byod/python' ;; + -t|--tpu) + SOURCE_IMAGE_TAG='kaggle/python-tpuvm-build:latest' + TARGET_IMAGE='gcr.io/kaggle-private-byod/python-tpuvm' + ;; -s|--source-image) if [[ -z $2 ]]; then usage diff --git a/renderizer/Dockerfile b/renderizer/Dockerfile new file mode 100644 index 00000000..9faac229 --- /dev/null +++ b/renderizer/Dockerfile @@ -0,0 +1,12 @@ +# Image used to generate the Dockerfiles from a Go text template. +# +# Build: +# docker build --rm --pull -t gcr.io/kaggle-images/go-renderizer -f Dockerfile . +# +# Push: +# docker push gcr.io/kaggle-images/go-renderizer +FROM golang:1.17 + +RUN go install github.com/gomatic/renderizer/v2/cmd/renderizer@v2.0.13 + +ENTRYPOINT ["renderizer"] \ No newline at end of file diff --git a/tensorflow-whl/CHANGELOG.md b/tensorflow-whl/CHANGELOG.md deleted file mode 100644 index 1dfad51e..00000000 --- a/tensorflow-whl/CHANGELOG.md +++ /dev/null @@ -1,26 +0,0 @@ -* `1.11.0-py36`: TensorFlow 1.11.0 wheels built with python 3.6 -* `1.12.0-py36`: TensorFlow 1.12.0 wheels with CUDA 9.2 -* `1.13.1-py36`: TensorFlow 1.13.1 wheels with CUDA 10.0 -* `1.13.1-py36-2`: TensorFlow 1.13.1 wheels with CUDA 10.0 and bump anaconda3 base image version to 5.3.0 -* `1.13.1-py37`: TensorFlow 1.13.1 with Python 3.7.0 and bump anaconda3 base image version to 5.3.0 -* `1.13.1-py37-2`: TensorFlow 1.13.1 with Python 3.7.3 -* `1.14.0-py36`: TensorFlow 1.14.0 with Python 3.6 -* `2.0.0-rc1-py36`: TensorFlow 2.0.0 RC1 with Python 3.6 -* `2.0.0-py36`: TensorFlow 2.0.0 with Python 3.6 -* `2.1.0-rc0-py36`: TensorFlow 2.1.0-rc0 with Python 3.6 -* `2.1.0-rc2-py36`: TensorFlow 2.1.0-rc2 with Python 3.6 -* `2.1.0-py36`: TensorFlow 2.1.0 with Python 3.6 -* `2.1.0-py36-2`: TensorFlow 2.1.0 with CUDA 10.1 -* `2.1.0-py37`: TensorFlow 2.1.0 with Python 3.7 -* `2.1.0-py37-2`: TensorFlow 2.1.0 with Python 3.7 & DLVM base image -* `2.1.0-py37-3`: TensorFlow 2.1.0 with Python 3.7, DLVM base image, tensorflow-gcs-config -* `2.2.0-py37`: TensorFlow 2.2.0 with Python 3.7 -* `2.2.0-py37-2`: TensorFlow 2.2.0 with Python 3.7 & TFA -* `2.3.0-py37`: TensorFlow 2.3.0 with Python 3.7 -* `2.3.1-py37`: TensorFlow 2.3.1 with Python 3.7 -* `2.3.1-py37-2`: TensorFlow 2.3.1 & TFA 0.11 with Python 3.7 -* `2.4.0-py37`: TensorFlow 2.4.0 & Python 3.7 & TFA 0.12 -* `2.4.0-py37-2`: TensorFlow 2.4.0 & m61 base image -* `2.4.0-py37-3`: TensorFlow 2.4.0 & 7.0 CUDA compute capability -* `2.4.1-py37`: TensorFlow 2.4.1 & TFA 0.12.1 -* `2.4.1-py37-2`: TensorFlow 2.4.1 & CUDA 11.0 \ No newline at end of file diff --git a/tensorflow-whl/Dockerfile b/tensorflow-whl/Dockerfile deleted file mode 100644 index 
e49314af..00000000 --- a/tensorflow-whl/Dockerfile +++ /dev/null @@ -1,131 +0,0 @@ -FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 AS nvidia -FROM gcr.io/deeplearning-platform-release/base-cpu:m66 - -# Avoid interactive configuration prompts/dialogs during apt-get. -ENV DEBIAN_FRONTEND=noninteractive - -# This is necessary to for apt to access HTTPS sources -RUN apt-get update && \ - apt-get install apt-transport-https - -# Cuda support -COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg -# See b/142337634#comment28 -RUN sed -i 's/deb https:\/\/developer.download.nvidia.com/deb http:\/\/developer.download.nvidia.com/' /etc/apt/sources.list.d/*.list - -# Ensure the cuda libraries are compatible with the GPU image. -# TODO(b/120050292): Use templating to keep in sync. -ENV CUDA_MAJOR_VERSION=11 -ENV CUDA_MINOR_VERSION=0 -ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -# The stub is useful to us both for built-time linking and run-time linking, on CPU-only systems. -# When intended to be used with actual GPUs, make sure to (besides providing access to the host -# CUDA user libraries, either manually or through the use of nvidia-docker) exclude them. One -# convenient way to do so is to obscure its contents by a bind mount: -# docker run .... -v /non-existing-directory:/usr/local/cuda/lib64/stubs:ro ... -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" -ENV NVIDIA_VISIBLE_DEVICES=all -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility -ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION" -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cupti-$CUDA_VERSION \ - cuda-cudart-$CUDA_VERSION \ - cuda-cudart-dev-$CUDA_VERSION \ - cuda-libraries-$CUDA_VERSION \ - cuda-libraries-dev-$CUDA_VERSION \ - cuda-nvml-dev-$CUDA_VERSION \ - cuda-minimal-build-$CUDA_VERSION \ - cuda-command-line-tools-$CUDA_VERSION \ - libcudnn8=8.0.4.30-1+cuda$CUDA_VERSION \ - libcudnn8-dev=8.0.4.30-1+cuda$CUDA_VERSION \ - libnccl2=2.7.8-1+cuda$CUDA_VERSION \ - libnccl-dev=2.7.8-1+cuda$CUDA_VERSION && \ - ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda && \ - ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 - -# Use Bazelisk to ensure the proper bazel version is used. -RUN cd /usr/local/src && \ - wget --no-verbose "https://github.com/bazelbuild/bazelisk/releases/download/v1.7.4/bazelisk-linux-amd64" && \ - mv bazelisk-linux-amd64 /usr/local/bin/bazel && \ - chmod u+x /usr/local/bin/bazel - -# Fetch TensorFlow & install dependencies. 
-RUN cd /usr/local/src && \ - git clone https://github.com/tensorflow/tensorflow && \ - cd tensorflow && \ - git checkout tags/v2.4.1 && \ - pip install keras_applications --no-deps && \ - pip install keras_preprocessing --no-deps - -# Create a TensorFlow wheel for CPU -RUN cd /usr/local/src/tensorflow && \ - cat /dev/null | ./configure && \ - bazel build --config=opt \ - --config=v2 \ - --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ - //tensorflow/tools/pip_package:build_pip_package && \ - bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_cpu && \ - bazel clean - -# Install TensorFlow CPU wheel which is required to build the GCS & addons libraries. -RUN pip install /tmp/tensorflow_cpu/tensorflow*.whl - -# Build TensorFlow addons library against TensorFlow CPU. -RUN cd /usr/local/src/ && \ - git clone https://github.com/tensorflow/addons && \ - cd addons && \ - git checkout tags/v0.12.1 && \ - python ./configure.py && \ - bazel build --enable_runfiles build_pip_pkg && \ - bazel-bin/build_pip_pkg /tmp/tfa_cpu && \ - bazel clean - -# Build tensorflow_gcs_config library against TensorFlow CPU. -ADD tensorflow-gcs-config /usr/local/src/tensorflow_gcs_config/ -RUN cd /usr/local/src/tensorflow_gcs_config && \ - apt-get install -y libcurl4-openssl-dev && \ - python setup.py bdist_wheel -d /tmp/tensorflow_gcs_config && \ - bazel clean - -# Create a tensorflow wheel for GPU/cuda -ENV TF_NEED_CUDA=1 -ENV TF_CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -# 3.7 (K80), 6.0 (P100), 7.0 (V100), 7.5 (T4): https://developer.nvidia.com/cuda-gpus -ENV TF_CUDA_COMPUTE_CAPABILITIES=3.7,6.0,7.0,7.5 -ENV TF_CUDNN_VERSION=8 -ENV TF_NCCL_VERSION=2 -ENV NCCL_INSTALL_PATH=/usr/ - -RUN cd /usr/local/src/tensorflow && \ - # TF_NCCL_INSTALL_PATH is used for both libnccl.so.2 and libnccl.h. Make sure they are both accessible from the same directory. - ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/ && \ - cat /dev/null | ./configure && \ - echo "/usr/local/cuda-${TF_CUDA_VERSION}/targets/x86_64-linux/lib/stubs" > /etc/ld.so.conf.d/cuda-stubs.conf && ldconfig && \ - bazel build --config=opt \ - --config=v2 \ - --config=cuda \ - --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ - //tensorflow/tools/pip_package:build_pip_package && \ - rm /etc/ld.so.conf.d/cuda-stubs.conf && ldconfig && \ - bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_gpu && \ - bazel clean - -# Install TensorFlow GPU wheel which to build addons against. -RUN pip install /tmp/tensorflow_gpu/tensorflow*.whl - -# Build TensorFlow addons library against TensorFlow GPU. 
-ENV CUDA_TOOLKIT_PATH=/usr/local/cuda -ENV CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu -RUN cd /usr/local/src/addons && \ - python ./configure.py && \ - bazel build --enable_runfiles build_pip_pkg && \ - bazel-bin/build_pip_pkg /tmp/tfa_gpu && \ - bazel clean - -# Print out the built .whl files -RUN ls -R /tmp/tensorflow* diff --git a/tensorflow-whl/Jenkinsfile b/tensorflow-whl/Jenkinsfile deleted file mode 100644 index 9a9cbd60..00000000 --- a/tensorflow-whl/Jenkinsfile +++ /dev/null @@ -1,36 +0,0 @@ -pipeline { - agent { label 'ephemeral-linux-gpu' } - options { - // The Build GPU stage depends on the image from the Push CPU stage - disableConcurrentBuilds() - } - environment { - GIT_COMMIT_SHORT = sh(returnStdout: true, script:"git rev-parse --short=7 HEAD").trim() - GIT_COMMIT_SUBJECT = sh(returnStdout: true, script:"git log --format=%s -n 1 HEAD").trim() - GIT_COMMIT_AUTHOR = sh(returnStdout: true, script:"git log --format='%an' -n 1 HEAD").trim() - GIT_COMMIT_SUMMARY = "`` ${GIT_COMMIT_SUBJECT} - ${GIT_COMMIT_AUTHOR}" - } - - stages { - stage('Build') { - steps { - sh '''#!/bin/bash - set -exo pipefail - - cd tensorflow-whl/ - ./build | ts - ''' - } - } - stage('Push') { - steps { - sh '''#!/bin/bash - set -exo pipefail - - cd tensorflow-whl/ - ./push ${GIT_BRANCH}-staging - ''' - } - } - } -} diff --git a/tensorflow-whl/README.md b/tensorflow-whl/README.md deleted file mode 100644 index 02c74d14..00000000 --- a/tensorflow-whl/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Build new Tensorflow wheels - -``` -./build -``` - -# Push the new wheels (Kaggle Engineers only) - -1. Add an entry in the [CHANGELOG](CHANGELOG.md) with an appropriate `LABEL`. -2. Push the new image using the `LABEL` you picked above. - - ``` - ./push LABEL - ``` - -# Use the new wheels - -Update the line below in the [CPU Dockerfile](../Dockerfile) and the [GPU Dockerfile](../gpu.Dockerfile) to use the new `LABEL`. - -To use wheels built locally: -``` -FROM kaggle/python-tensorflow-whl as tensorflow_whl -``` - -To use our pre-built wheels: -``` -FROM gcr.io/kaggle-images/python-tensorflow-whl: