Commit 0441e26

Merge pull request #199 from Kaggle/upgrade-cuda-11.8
Upgrade to CUDA 11.7
2 parents: 59a11e3 + 4443985

File tree: 4 files changed, +21 −14 lines

  Dockerfile
  gpu.Dockerfile
  test
  tests/test_keras.R

Dockerfile (+2 −1)

@@ -28,7 +28,8 @@ RUN apt-get update && \
     touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \
     # papermill can replace nbconvert for executing notebooks
     pip install papermill && \
-    pip install jupyterlab-lsp && \
+    # b/276358430 fix Jupyter lsp freezing up the jupyter server
+    pip install jupyterlab-lsp "jupyter-lsp==1.5.1" && \
     /tmp/clean-layer.sh

 # Miniconda
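For context, the added pin keeps the server-side jupyter-lsp package at 1.5.1 alongside the jupyterlab-lsp frontend extension, per the b/276358430 comment. A quick sanity check (a sketch, assuming the CPU image is tagged gcr.io/kaggle-images/rstats:staging, i.e. the BASE_TAG used in gpu.Dockerfile) is to list the resolved versions from the built image:

    # list the jupyter-lsp / jupyterlab-lsp versions baked into the image
    docker run --rm gcr.io/kaggle-images/rstats:staging pip list | grep -i lsp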

gpu.Dockerfile (+10 −9)

@@ -1,5 +1,5 @@
 ARG BASE_TAG=staging
-FROM nvidia/cuda:11.4.2-cudnn8-devel-ubuntu18.04 AS nvidia
+FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04 AS nvidia
 FROM gcr.io/kaggle-images/rstats:${BASE_TAG}
 ARG ncpus=1

@@ -10,11 +10,12 @@ COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/
 COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg

 ENV CUDA_MAJOR_VERSION=11
-ENV CUDA_MINOR_VERSION=4
-ENV CUDA_PATCH_VERSION=2
+ENV CUDA_MINOR_VERSION=7
+ENV CUDA_PATCH_VERSION=0
 ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.$CUDA_PATCH_VERSION
 ENV CUDA_PKG_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION
-ENV CUDNN_VERSION=8.2.4.15
+ENV CUDNN_VERSION=8.5.0.96
+ENV NCCL_VERSION=2.13.4-1
 LABEL com.nvidia.volumes.needed="nvidia_driver"
 LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"

@@ -41,8 +42,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libcudnn8-dev=$CUDNN_VERSION-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
     libcublas-$CUDA_PKG_VERSION \
     libcublas-dev-$CUDA_PKG_VERSION \
-    libnccl2=2.11.4-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
-    libnccl-dev=2.11.4-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
+    libnccl2=$NCCL_VERSION+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
+    libnccl-dev=$NCCL_VERSION+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
     /tmp/clean-layer.sh

 ENV CUDA_HOME=/usr/local/cuda

@@ -55,7 +56,7 @@ ENV CUDA_HOME=/usr/local/cuda
 ADD ldpaths $R_HOME/etc/ldpaths

 # Install tensorflow with GPU support
-RUN R -e 'keras::install_keras(tensorflow = "2.6-gpu")' && \
+RUN R -e 'keras::install_keras(tensorflow = "gpu")' && \
     rm -rf /tmp/tensorflow_gpu && \
     /tmp/clean-layer.sh

@@ -70,8 +71,8 @@ RUN CPATH=/usr/local/cuda/targets/x86_64-linux/include install2.r --error --ncpu

 # Torch: install the full package upfront otherwise it will be installed on loading the package which doesn't work for kernels
 # without internet (competitions for example). It will detect CUDA and install the proper version.
-# Make Torch think we use CUDA 11.3 (https://github.com/mlverse/torch/issues/807)
-ENV CUDA=11.3
+# Make Torch think we use CUDA 11.7 (https://github.com/mlverse/torch/issues/807)
+ENV CUDA=11.7
 RUN R -e 'install.packages("torch")'
 RUN R -e 'library(torch); install_torch()'
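With the values set above (CUDA 11.7, CUDNN_VERSION=8.5.0.96, NCCL_VERSION=2.13.4-1), the versioned apt pins in the install hunk expand to libcudnn8-dev=8.5.0.96-1+cuda11.7, libnccl2=2.13.4-1+cuda11.7 and libnccl-dev=2.13.4-1+cuda11.7. A minimal sketch for confirming those exact versions exist in NVIDIA's apt repository, assuming it is run inside the nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04 base:

    # print every candidate version the CUDA apt repo offers for these packages
    apt-get update && apt-cache madison libcudnn8-dev libnccl2 libnccl-dev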

test (+3 −1)

@@ -76,6 +76,7 @@ mkdir -p /tmp/rstats-build/working
 # Check that Jupyter server can run; if it dies on startup, the `docker kill` command will throw an error
 docker run -d --name=jupyter_test_r --read-only --net=none \
     -e HOME=/tmp \
+    -e NVIDIA_DISABLE_REQUIRE=1 \
     -v $PWD:/input:ro -v /tmp/rstats-build/working:/working \
     -v /tmp/rstats-build/tmp:/tmp -v /tmp/rstats-build/devshm:/dev/shm \
     -w=/working \

@@ -84,7 +85,7 @@ sleep 3
 docker kill jupyter_test_r && docker rm jupyter_test_r

 # Check that papermill is installed in python (b/191304257).
-docker run --rm --name=papermill_test_r --read-only --net=none \
+docker run --rm -e NVIDIA_DISABLE_REQUIRE=1 --name=papermill_test_r --read-only --net=none \
     "$IMAGE_TAG" python -c 'import sys;import papermill as pm; print(pm.__version__)'


@@ -93,6 +94,7 @@ docker run --rm --name=papermill_test_r --read-only --net=none \
 docker run --rm -t --net=none \
     -e HOME=/tmp \
     -e TF_FORCE_GPU_ALLOW_GROWTH=true \
+    -e NVIDIA_DISABLE_REQUIRE=1 \
     -v $PWD:/input:ro -v /tmp/rstats-build/working:/working \
     -v /tmp/rstats-build/tmp:/tmp -v /tmp/rstats-build/devshm:/dev/shm \
     -w=/working \
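Why NVIDIA_DISABLE_REQUIRE=1: the CUDA base image declares a minimum driver requirement through NVIDIA_REQUIRE_CUDA, and the NVIDIA container runtime refuses to start a container whose host driver does not satisfy it; setting NVIDIA_DISABLE_REQUIRE=1 skips that check, presumably so these smoke tests still start on CI hosts whose driver (if any) predates CUDA 11.7. The same flag works for ad-hoc runs of the built image, e.g. (a sketch; $IMAGE_TAG is whatever tag the build produced):

    # start the image without enforcing the cuda>=11.7 driver requirement
    docker run --rm -e NVIDIA_DISABLE_REQUIRE=1 "$IMAGE_TAG" R -e 'R.version.string'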

tests/test_keras.R (+6 −3)

@@ -12,9 +12,11 @@ test_that("model training", {
     layer_dropout(rate=0.4) %>%
     layer_dense(unit=3, activation='softmax')

+  optimizers <- keras::keras$optimizers
+
   model %>% compile(
     loss = 'categorical_crossentropy',
-    optimizer = optimizer_rmsprop(),
+    optimizer = optimizers$RMSprop(),
     metrics = c('accuracy')
   )

@@ -82,7 +84,7 @@ test_that("CNN model training", {
   datagen %>% fit_image_data_generator(train.feature)

   history <- model %>%
-    fit_generator(
+    fit(
       flow_images_from_data(train.feature, train.label, datagen, batch_size = 10),
       steps_per_epoch = nrow(train.feature) / 10,
       epochs = 1)

@@ -108,5 +110,6 @@ test_that("flow_images_from_dataframe", {
     class_mode = NULL,
     target_size = c(224, 224))

-  expect_is(pred, "keras_preprocessing.image.dataframe_iterator.DataFrameIterator")
+  batch <- generator_next(pred, completed = NULL)
+  expect_gt(length(batch), 0)
 })
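These test edits track the newer keras/TensorFlow pulled in by install_keras(tensorflow = "gpu"): fit_generator() is deprecated in favour of fit(), the optimizer is constructed through the underlying Python module (keras::keras$optimizers$RMSprop()) instead of the optimizer_rmsprop() wrapper, and the brittle assertion on a Python class name is replaced by drawing a batch from the iterator and checking it is non-empty. A sketch for re-running just this file against the built image (mount layout mirrors the test script; $IMAGE_TAG is illustrative):

    # execute only the keras tests inside the freshly built container
    docker run --rm -e NVIDIA_DISABLE_REQUIRE=1 \
        -v "$PWD:/input:ro" -w /input "$IMAGE_TAG" \
        Rscript -e 'testthat::test_file("tests/test_keras.R")'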
