From 3d2d1196aab9863b7222456df73c044e5a8a5643 Mon Sep 17 00:00:00 2001
From: Peter Amstutz <peter.amstutz@curii.com>
Date: Thu, 24 Feb 2022 03:58:55 -0500
Subject: [PATCH] Cuda request gpu device expr (#1629)

* Allow expressions to dynamically request min/max number of GPUs.

* Add test coverage for CUDA checks
---
 cwltool/cuda.py                   |  10 +--
 cwltool/docker.py                 |   9 +-
 cwltool/executors.py              |   3 +
 cwltool/extensions-v1.1.yml       |  34 ++++++--
 cwltool/extensions.yml            |  34 ++++++--
 cwltool/job.py                    |   5 +-
 cwltool/process.py                |  32 +++++--
 cwltool/singularity.py            |   6 +-
 mypy-requirements.txt             |   1 +
 tests/test_cuda.py                | 136 +++++++++++++++++++++++++++++-
 tests/wf/nvidia-smi-cc.cwl        |  17 ++++
 tests/wf/nvidia-smi-container.cwl |   2 +-
 tests/wf/nvidia-smi-max.cwl       |  14 +++
 tests/wf/nvidia-smi-range.cwl     |  15 ++++
 tests/wf/nvidia-smi.cwl           |   8 +-
 15 files changed, 279 insertions(+), 47 deletions(-)
 create mode 100644 tests/wf/nvidia-smi-cc.cwl
 create mode 100644 tests/wf/nvidia-smi-max.cwl
 create mode 100644 tests/wf/nvidia-smi-range.cwl

diff --git a/cwltool/cuda.py b/cwltool/cuda.py
index 8118428f1..65dc19c10 100644
--- a/cwltool/cuda.py
+++ b/cwltool/cuda.py
@@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
     return (cv.data, int(ag.data))
 
 
-def cuda_check(cuda_req: CWLObjectType) -> int:
+def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
     try:
         vmin = float(str(cuda_req["cudaVersionMin"]))
         version, devices = cuda_version_and_device_count()
@@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
                 "CUDA version '%s' is less than minimum version '%s'", version, vmin
             )
             return 0
-        dmin = cast(int, cuda_req.get("deviceCountMin", 1))
-        dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
-        if devices < dmin:
+        if requestCount > devices:
             _logger.warning(
-                "Requested at least %d GPU devices but only %d available", dmin, devices
+                "Requested %d GPU devices but only %d available", requestCount, devices
             )
             return 0
-        return min(dmax, devices)
+        return requestCount
     except Exception as e:
         _logger.warning("Error checking CUDA requirements: %s", e)
         return 0
diff --git a/cwltool/docker.py b/cwltool/docker.py
index 2784ef593..69f194cca 100644
--- a/cwltool/docker.py
+++ b/cwltool/docker.py
@@ -397,13 +397,8 @@ def create_runtime(
         if runtimeContext.rm_container:
             runtime.append("--rm")
 
-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
-            count = cuda_check(cuda_req)
-            runtime.append("--gpus=" + str(count))
+        if self.builder.resources.get("cudaDeviceCount"):
+            runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))
 
         cidfile_path = None  # type: Optional[str]
         # add parameters to docker to write a container ID file
diff --git a/cwltool/executors.py b/cwltool/executors.py
index ab1a19602..32d3e4fe0 100644
--- a/cwltool/executors.py
+++ b/cwltool/executors.py
@@ -305,6 +305,9 @@ def select_resources(
         result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
         result["outdirSize"] = math.ceil(request["outdirMin"])
 
+        if "cudaDeviceCount" in request:
+            result["cudaDeviceCount"] = request["cudaDeviceCount"]
+
         return result
 
     def _runner(self, job, runtime_context, TMPDIR_LOCK):
diff --git a/cwltool/extensions-v1.1.yml b/cwltool/extensions-v1.1.yml
index 81c24983e..603c40f05 100644
--- a/cwltool/extensions-v1.1.yml
+++ b/cwltool/extensions-v1.1.yml
@@ -93,13 +93,29 @@ $graph:
 
         See https://docs.nvidia.com/deploy/cuda-compatibility/ for
         details.
-    cudaComputeCapabilityMin:
-      type: string
-      doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-    deviceCountMin:
-      type: int?
+    cudaComputeCapability:
+      type:
+        - 'string'
+        - 'string[]'
+      doc: |
+        CUDA hardware capability required to run the software, in X.Y
+        format.
+
+        * If this is a single value, it defines only the minimum
+          compute capability.  GPUs with higher capability are also
+          accepted.
+
+        * If it is an array value, then only select GPUs with compute
+          capabilities that explicitly appear in the array.
+    cudaDeviceCountMin:
+      type: ['null', int, cwl:Expression]
       default: 1
-      doc: Minimum number of GPU devices to request, default 1.
-    deviceCountMax:
-      type: int?
-      doc: Maximum number of GPU devices to request.  If not specified, same as `deviceCountMin`.
+      doc: |
+        Minimum number of GPU devices to request.  If not specified,
+        same as `cudaDeviceCountMax`.  If neither is specified,
+        default 1.
+    cudaDeviceCountMax:
+      type: ['null', int, cwl:Expression]
+      doc: |
+        Maximum number of GPU devices to request.  If not specified,
+        same as `cudaDeviceCountMin`.
diff --git a/cwltool/extensions.yml b/cwltool/extensions.yml
index 17260ee5e..6c36760fa 100644
--- a/cwltool/extensions.yml
+++ b/cwltool/extensions.yml
@@ -203,13 +203,29 @@ $graph:
 
         See https://docs.nvidia.com/deploy/cuda-compatibility/ for
         details.
-    cudaComputeCapabilityMin:
-      type: string
-      doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-    deviceCountMin:
-      type: int?
+    cudaComputeCapability:
+      type:
+        - 'string'
+        - 'string[]'
+      doc: |
+        CUDA hardware capability required to run the software, in X.Y
+        format.
+
+        * If this is a single value, it defines only the minimum
+          compute capability.  GPUs with higher capability are also
+          accepted.
+
+        * If it is an array value, then only select GPUs with compute
+          capabilities that explicitly appear in the array.
+    cudaDeviceCountMin:
+      type: ['null', int, cwl:Expression]
       default: 1
-      doc: Minimum number of GPU devices to request, default 1.
-    deviceCountMax:
-      type: int?
-      doc: Maximum number of GPU devices to request.  If not specified, same as `deviceCountMin`.
+      doc: |
+        Minimum number of GPU devices to request.  If not specified,
+        same as `cudaDeviceCountMax`.  If neither is specified,
+        default 1.
+    cudaDeviceCountMax:
+      type: ['null', int, cwl:Expression]
+      doc: |
+        Maximum number of GPU devices to request.  If not specified,
+        same as `cudaDeviceCountMin`.
diff --git a/cwltool/job.py b/cwltool/job.py
index 38a928929..c360dac84 100644
--- a/cwltool/job.py
+++ b/cwltool/job.py
@@ -2,6 +2,7 @@
 import functools
 import itertools
 import logging
+import math
 import os
 import re
 import shutil
@@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
             "http://commonwl.org/cwltool#CUDARequirement"
         )
         if cuda_req:
-            count = cuda_check(cuda_req)
+            count = cuda_check(
+                cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
+            )
             if count == 0:
                 raise WorkflowException("Could not satisfy CUDARequirement")
 
diff --git a/cwltool/process.py b/cwltool/process.py
index d6975e5ce..2c7ce6af9 100644
--- a/cwltool/process.py
+++ b/cwltool/process.py
@@ -980,6 +980,7 @@ def evalResources(
         resourceReq, _ = self.get_requirement("ResourceRequirement")
         if resourceReq is None:
             resourceReq = {}
+
         cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
         if cwl_version == "v1.0":
             ram = 1024
@@ -995,20 +996,34 @@ def evalResources(
             "outdirMin": 1024,
             "outdirMax": 1024,
         }
-        for a in ("cores", "ram", "tmpdir", "outdir"):
+
+        cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
+        if cudaReq:
+            request["cudaDeviceCountMin"] = 1
+            request["cudaDeviceCountMax"] = 1
+
+        for rsc, a in (
+            (resourceReq, "cores"),
+            (resourceReq, "ram"),
+            (resourceReq, "tmpdir"),
+            (resourceReq, "outdir"),
+            (cudaReq, "cudaDeviceCount"),
+        ):
+            if rsc is None:
+                continue
             mn = mx = None  # type: Optional[Union[int, float]]
-            if resourceReq.get(a + "Min"):
+            if rsc.get(a + "Min"):
                 mn = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Min"])
+                        builder, cast(Union[str, int, float], rsc[a + "Min"])
                     ),
                 )
-            if resourceReq.get(a + "Max"):
+            if rsc.get(a + "Max"):
                 mx = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Max"])
+                        builder, cast(Union[str, int, float], rsc[a + "Max"])
                     ),
                 )
             if mn is None:
@@ -1022,13 +1037,18 @@ def evalResources(
 
         request_evaluated = cast(Dict[str, Union[int, float]], request)
         if runtimeContext.select_resources is not None:
+            # Call select resources hook
             return runtimeContext.select_resources(request_evaluated, runtimeContext)
-        return {
+
+        defaultReq = {
             "cores": request_evaluated["coresMin"],
             "ram": math.ceil(request_evaluated["ramMin"]),
             "tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
             "outdirSize": math.ceil(request_evaluated["outdirMin"]),
         }
+        if cudaReq:
+            defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
+        return defaultReq
 
     def validate_hints(
         self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
diff --git a/cwltool/singularity.py b/cwltool/singularity.py
index 7d4d5b66a..4d9cacca8 100644
--- a/cwltool/singularity.py
+++ b/cwltool/singularity.py
@@ -434,11 +434,7 @@ def create_runtime(
         else:
             runtime.extend(["--net", "--network", "none"])
 
-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
+        if self.builder.resources.get("cudaDeviceCount"):
             runtime.append("--nv")
 
         for name, value in self.environment.items():
diff --git a/mypy-requirements.txt b/mypy-requirements.txt
index c3461b0a5..23611340f 100644
--- a/mypy-requirements.txt
+++ b/mypy-requirements.txt
@@ -2,3 +2,4 @@ mypy==0.931
 types-requests
 types-setuptools
 types-psutil
+types-mock
diff --git a/tests/test_cuda.py b/tests/test_cuda.py
index 3a5b1cfe9..5e22cd693 100644
--- a/tests/test_cuda.py
+++ b/tests/test_cuda.py
@@ -1,10 +1,24 @@
+import mock
 import pytest
+from schema_salad.avro import schema
 
+from cwltool.builder import Builder
+from cwltool.context import LoadingContext, RuntimeContext
 from cwltool.cuda import cuda_version_and_device_count
+from cwltool.errors import WorkflowException
+from cwltool.job import CommandLineJob
+from cwltool.load_tool import load_tool
 from cwltool.main import main
+from cwltool.pathmapper import MapperEnt, PathMapper
+from cwltool.process import use_custom_schema, use_standard_schema
+from cwltool.stdfsaccess import StdFsAccess
+from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION
+from cwltool.utils import CWLObjectType
 
 from .util import get_data, needs_docker, needs_singularity_3_or_newer
 
+from unittest.mock import MagicMock
+
 cuda_version = cuda_version_and_device_count()
 
 
@@ -39,7 +53,127 @@ def test_cuda_singularity() -> None:
 def test_cuda_no_container() -> None:
     params = [
         "--enable-ext",
-        "--singularity",
         get_data("tests/wf/nvidia-smi.cwl"),
     ]
     assert main(params) == 0
+
+
+@pytest.mark.skipif(
+    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
+)
+def test_cuda_cc_list() -> None:
+    params = [
+        "--enable-ext",
+        get_data("tests/wf/nvidia-smi-cc.cwl"),  # cudaComputeCapability is a list here
+    ]
+    assert main(params) == 0
+
+
+def _makebuilder(cudaReq: CWLObjectType) -> Builder:
+    return Builder(
+        {},
+        [],
+        [],
+        {},
+        schema.Names(),
+        [cudaReq],  # presumably `requirements` -- TODO confirm against Builder
+        [],
+        {"cudaDeviceCount": 1},  # presumably `resources` -- TODO confirm
+        None,
+        None,
+        StdFsAccess,
+        StdFsAccess(""),
+        None,
+        0.1,
+        False,
+        False,
+        False,
+        "",
+        "",
+        "",
+        "",
+        INTERNAL_VERSION,
+        "docker",
+    )
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+    """Mocked CUDA 1.0 with one GPU meets cudaVersionMin 1.0: _setup must not raise."""
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "1.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    jb._setup(runtime_context)
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
+    """cudaVersionMin 2.0 against a mocked CUDA 1.0: _setup must raise."""
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "2.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    with pytest.raises(WorkflowException):
+        jb._setup(runtime_context)
+
+
+def test_cuda_eval_resource_range() -> None:
+    """With cudaDeviceCountMin 2 and Max 4, evalResources requests 2 devices."""
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+    try:
+        loadingContext = LoadingContext({"do_update": True})
+        tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext)
+        builder = _makebuilder(tool.requirements[0])
+        builder.job = {}
+        resources = tool.evalResources(builder, RuntimeContext({}))
+    finally:
+        # Drop the custom schema again so it cannot leak into other tests.
+        use_standard_schema("v1.2")
+
+    assert resources["cudaDeviceCount"] == 2
+
+
+def test_cuda_eval_resource_max() -> None:
+    """With only cudaDeviceCountMax 4 given, evalResources requests 4 devices."""
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+    try:
+        loadingContext = LoadingContext({"do_update": True})
+        tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext)
+        builder = _makebuilder(tool.requirements[0])
+        builder.job = {}
+        resources = tool.evalResources(builder, RuntimeContext({}))
+    finally:
+        # Drop the custom schema again so it cannot leak into other tests.
+        use_standard_schema("v1.2")
+
+    assert resources["cudaDeviceCount"] == 4
diff --git a/tests/wf/nvidia-smi-cc.cwl b/tests/wf/nvidia-smi-cc.cwl
new file mode 100644
index 000000000..a4f315b0e
--- /dev/null
+++ b/tests/wf/nvidia-smi-cc.cwl
@@ -0,0 +1,17 @@
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  cwltool: "http://commonwl.org/cwltool#"
+requirements:
+  cwltool:CUDARequirement:
+    cudaVersionMin: "1.0"
+    cudaComputeCapability: ["1.0", "2.0", "3.0"]
+    cudaDeviceCountMin: $(inputs.gpus)
+inputs:
+  gpus:
+    type: int
+    default: 1
+outputs: []
+# Assume this will exit non-zero (resulting in a failing test case) if
+# nvidia-smi doesn't detect any devices.
+baseCommand: "nvidia-smi"
diff --git a/tests/wf/nvidia-smi-container.cwl b/tests/wf/nvidia-smi-container.cwl
index 0ef0296a2..84fd72d83 100644
--- a/tests/wf/nvidia-smi-container.cwl
+++ b/tests/wf/nvidia-smi-container.cwl
@@ -5,7 +5,7 @@ $namespaces:
 requirements:
   cwltool:CUDARequirement:
     cudaVersionMin: "1.0"
-    cudaComputeCapabilityMin: "1.0"
+    cudaComputeCapability: "1.0"
   DockerRequirement:
     dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
 inputs: []
diff --git a/tests/wf/nvidia-smi-max.cwl b/tests/wf/nvidia-smi-max.cwl
new file mode 100644
index 000000000..d3d4d5e9c
--- /dev/null
+++ b/tests/wf/nvidia-smi-max.cwl
@@ -0,0 +1,14 @@
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  cwltool: "http://commonwl.org/cwltool#"
+requirements:
+  cwltool:CUDARequirement:
+    cudaVersionMin: "1.0"
+    cudaComputeCapability: "1.0"
+    cudaDeviceCountMax: 4
+inputs: []
+outputs: []
+# Assume this will exit non-zero (resulting in a failing test case) if
+# nvidia-smi doesn't detect any devices.
+baseCommand: "nvidia-smi"
diff --git a/tests/wf/nvidia-smi-range.cwl b/tests/wf/nvidia-smi-range.cwl
new file mode 100644
index 000000000..19d3ea43c
--- /dev/null
+++ b/tests/wf/nvidia-smi-range.cwl
@@ -0,0 +1,15 @@
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  cwltool: "http://commonwl.org/cwltool#"
+requirements:
+  cwltool:CUDARequirement:
+    cudaVersionMin: "1.0"
+    cudaComputeCapability: "1.0"
+    cudaDeviceCountMin: 2
+    cudaDeviceCountMax: 4
+inputs: []
+outputs: []
+# Assume this will exit non-zero (resulting in a failing test case) if
+# nvidia-smi doesn't detect any devices.
+baseCommand: "nvidia-smi"
diff --git a/tests/wf/nvidia-smi.cwl b/tests/wf/nvidia-smi.cwl
index 88c72ddc6..8e227d0c5 100644
--- a/tests/wf/nvidia-smi.cwl
+++ b/tests/wf/nvidia-smi.cwl
@@ -5,8 +5,12 @@ $namespaces:
 requirements:
   cwltool:CUDARequirement:
     cudaVersionMin: "1.0"
-    cudaComputeCapabilityMin: "1.0"
-inputs: []
+    cudaComputeCapability: "1.0"
+    cudaDeviceCountMin: $(inputs.gpus)
+inputs:
+  gpus:
+    type: int
+    default: 1
 outputs: []
 # Assume this will exit non-zero (resulting in a failing test case) if
 # nvidia-smi doesn't detect any devices.