From 3d2d1196aab9863b7222456df73c044e5a8a5643 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 24 Feb 2022 03:58:55 -0500 Subject: [PATCH] Cuda request gpu device expr (#1629) * Allow expressions to dynamically request min/max number of GPUs. * Add test coverage for CUDA checks --- cwltool/cuda.py | 10 +-- cwltool/docker.py | 9 +- cwltool/executors.py | 3 + cwltool/extensions-v1.1.yml | 34 ++++++-- cwltool/extensions.yml | 34 ++++++-- cwltool/job.py | 5 +- cwltool/process.py | 32 +++++-- cwltool/singularity.py | 6 +- mypy-requirements.txt | 1 + tests/test_cuda.py | 136 +++++++++++++++++++++++++++++- tests/wf/nvidia-smi-cc.cwl | 17 ++++ tests/wf/nvidia-smi-container.cwl | 2 +- tests/wf/nvidia-smi-max.cwl | 14 +++ tests/wf/nvidia-smi-range.cwl | 15 ++++ tests/wf/nvidia-smi.cwl | 8 +- 15 files changed, 279 insertions(+), 47 deletions(-) create mode 100644 tests/wf/nvidia-smi-cc.cwl create mode 100644 tests/wf/nvidia-smi-max.cwl create mode 100644 tests/wf/nvidia-smi-range.cwl diff --git a/cwltool/cuda.py b/cwltool/cuda.py index 8118428f1..65dc19c10 100644 --- a/cwltool/cuda.py +++ b/cwltool/cuda.py @@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]: return (cv.data, int(ag.data)) -def cuda_check(cuda_req: CWLObjectType) -> int: +def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int: try: vmin = float(str(cuda_req["cudaVersionMin"])) version, devices = cuda_version_and_device_count() @@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int: "CUDA version '%s' is less than minimum version '%s'", version, vmin ) return 0 - dmin = cast(int, cuda_req.get("deviceCountMin", 1)) - dmax = cast(int, cuda_req.get("deviceCountMax", dmin)) - if devices < dmin: + if requestCount > devices: _logger.warning( - "Requested at least %d GPU devices but only %d available", dmin, devices + "Requested %d GPU devices but only %d available", requestCount, devices ) return 0 - return min(dmax, devices) + return requestCount except Exception as e: _logger.warning("Error checking CUDA requirements: %s", e) return 0 diff --git a/cwltool/docker.py b/cwltool/docker.py index 2784ef593..69f194cca 100644 --- a/cwltool/docker.py +++ b/cwltool/docker.py @@ -397,13 +397,8 @@ def create_runtime( if runtimeContext.rm_container: runtime.append("--rm") - cuda_req, _ = self.builder.get_requirement( - "http://commonwl.org/cwltool#CUDARequirement" - ) - if cuda_req: - # Checked earlier that the device count is non-zero in _setup - count = cuda_check(cuda_req) - runtime.append("--gpus=" + str(count)) + if self.builder.resources.get("cudaDeviceCount"): + runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"])) cidfile_path = None # type: Optional[str] # add parameters to docker to write a container ID file diff --git a/cwltool/executors.py b/cwltool/executors.py index ab1a19602..32d3e4fe0 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -305,6 +305,9 @@ def select_resources( result["tmpdirSize"] = math.ceil(request["tmpdirMin"]) result["outdirSize"] = math.ceil(request["outdirMin"]) + if "cudaDeviceCount" in request: + result["cudaDeviceCount"] = request["cudaDeviceCount"] + return result def _runner(self, job, runtime_context, TMPDIR_LOCK): diff --git a/cwltool/extensions-v1.1.yml b/cwltool/extensions-v1.1.yml index 81c24983e..603c40f05 100644 --- a/cwltool/extensions-v1.1.yml +++ b/cwltool/extensions-v1.1.yml @@ -93,13 +93,29 @@ $graph: See https://docs.nvidia.com/deploy/cuda-compatibility/ for details. - cudaComputeCapabilityMin: - type: string - doc: Minimum CUDA hardware capability required to run the software, in X.Y format. - deviceCountMin: - type: int? + cudaComputeCapability: + type: + - 'string' + - 'string[]' + doc: | + CUDA hardware capability required to run the software, in X.Y + format. + + * If this is a single value, it defines only the minimum + compute capability. GPUs with higher capability are also + accepted. + + * If it is an array value, then only select GPUs with compute + capabilities that explicitly appear in the array. + cudaDeviceCountMin: + type: ['null', int, cwl:Expression] default: 1 - doc: Minimum number of GPU devices to request, default 1. - deviceCountMax: - type: int? - doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`. + doc: | + Minimum number of GPU devices to request. If not specified, + same as `cudaDeviceCountMax`. If neither are specified, + default 1. + cudaDeviceCountMax: + type: ['null', int, cwl:Expression] + doc: | + Maximum number of GPU devices to request. If not specified, + same as `cudaDeviceCountMin`. diff --git a/cwltool/extensions.yml b/cwltool/extensions.yml index 17260ee5e..6c36760fa 100644 --- a/cwltool/extensions.yml +++ b/cwltool/extensions.yml @@ -203,13 +203,29 @@ $graph: See https://docs.nvidia.com/deploy/cuda-compatibility/ for details. - cudaComputeCapabilityMin: - type: string - doc: Minimum CUDA hardware capability required to run the software, in X.Y format. - deviceCountMin: - type: int? + cudaComputeCapability: + type: + - 'string' + - 'string[]' + doc: | + CUDA hardware capability required to run the software, in X.Y + format. + + * If this is a single value, it defines only the minimum + compute capability. GPUs with higher capability are also + accepted. + + * If it is an array value, then only select GPUs with compute + capabilities that explicitly appear in the array. + cudaDeviceCountMin: + type: ['null', int, cwl:Expression] default: 1 - doc: Minimum number of GPU devices to request, default 1. - deviceCountMax: - type: int? - doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`. + doc: | + Minimum number of GPU devices to request. If not specified, + same as `cudaDeviceCountMax`. If neither are specified, + default 1. + cudaDeviceCountMax: + type: ['null', int, cwl:Expression] + doc: | + Maximum number of GPU devices to request. If not specified, + same as `cudaDeviceCountMin`. diff --git a/cwltool/job.py b/cwltool/job.py index 38a928929..c360dac84 100644 --- a/cwltool/job.py +++ b/cwltool/job.py @@ -2,6 +2,7 @@ import functools import itertools import logging +import math import os import re import shutil @@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None: "http://commonwl.org/cwltool#CUDARequirement" ) if cuda_req: - count = cuda_check(cuda_req) + count = cuda_check( + cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"]) + ) if count == 0: raise WorkflowException("Could not satisfy CUDARequirement") diff --git a/cwltool/process.py b/cwltool/process.py index d6975e5ce..2c7ce6af9 100644 --- a/cwltool/process.py +++ b/cwltool/process.py @@ -980,6 +980,7 @@ def evalResources( resourceReq, _ = self.get_requirement("ResourceRequirement") if resourceReq is None: resourceReq = {} + cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None) if cwl_version == "v1.0": ram = 1024 @@ -995,20 +996,34 @@ def evalResources( "outdirMin": 1024, "outdirMax": 1024, } - for a in ("cores", "ram", "tmpdir", "outdir"): + + cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement") + if cudaReq: + request["cudaDeviceCountMin"] = 1 + request["cudaDeviceCountMax"] = 1 + + for rsc, a in ( + (resourceReq, "cores"), + (resourceReq, "ram"), + (resourceReq, "tmpdir"), + (resourceReq, "outdir"), + (cudaReq, "cudaDeviceCount"), + ): + if rsc is None: + continue mn = mx = None # type: Optional[Union[int, float]] - if resourceReq.get(a + "Min"): + if rsc.get(a + "Min"): mn = cast( Union[int, float], eval_resource( - builder, cast(Union[str, int, float], resourceReq[a + "Min"]) + builder, cast(Union[str, int, float], rsc[a + "Min"]) ), ) - if resourceReq.get(a + "Max"): + if rsc.get(a + "Max"): mx = cast( Union[int, float], eval_resource( - builder, cast(Union[str, int, float], resourceReq[a + "Max"]) + builder, cast(Union[str, int, float], rsc[a + "Max"]) ), ) if mn is None: @@ -1022,13 +1037,18 @@ def evalResources( request_evaluated = cast(Dict[str, Union[int, float]], request) if runtimeContext.select_resources is not None: + # Call select resources hook return runtimeContext.select_resources(request_evaluated, runtimeContext) - return { + + defaultReq = { "cores": request_evaluated["coresMin"], "ram": math.ceil(request_evaluated["ramMin"]), "tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]), "outdirSize": math.ceil(request_evaluated["outdirMin"]), } + if cudaReq: + defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"] + return defaultReq def validate_hints( self, avsc_names: Names, hints: List[CWLObjectType], strict: bool diff --git a/cwltool/singularity.py b/cwltool/singularity.py index 7d4d5b66a..4d9cacca8 100644 --- a/cwltool/singularity.py +++ b/cwltool/singularity.py @@ -434,11 +434,7 @@ def create_runtime( else: runtime.extend(["--net", "--network", "none"]) - cuda_req, _ = self.builder.get_requirement( - "http://commonwl.org/cwltool#CUDARequirement" - ) - if cuda_req: - # Checked earlier that the device count is non-zero in _setup + if self.builder.resources.get("cudaDeviceCount"): runtime.append("--nv") for name, value in self.environment.items(): diff --git a/mypy-requirements.txt b/mypy-requirements.txt index c3461b0a5..23611340f 100644 --- a/mypy-requirements.txt +++ b/mypy-requirements.txt @@ -2,3 +2,4 @@ mypy==0.931 types-requests types-setuptools types-psutil +types-mock diff --git a/tests/test_cuda.py b/tests/test_cuda.py index 3a5b1cfe9..5e22cd693 100644 --- a/tests/test_cuda.py +++ b/tests/test_cuda.py @@ -1,10 +1,24 @@ +import mock import pytest +from schema_salad.avro import schema +from cwltool.builder import Builder +from cwltool.context import LoadingContext, RuntimeContext from cwltool.cuda import cuda_version_and_device_count +from cwltool.errors import WorkflowException +from cwltool.job import CommandLineJob +from cwltool.load_tool import load_tool from cwltool.main import main +from cwltool.pathmapper import MapperEnt, PathMapper +from cwltool.process import use_custom_schema, use_standard_schema +from cwltool.stdfsaccess import StdFsAccess +from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION +from cwltool.utils import CWLObjectType from .util import get_data, needs_docker, needs_singularity_3_or_newer +from unittest.mock import MagicMock + cuda_version = cuda_version_and_device_count() @@ -39,7 +53,127 @@ def test_cuda_singularity() -> None: def test_cuda_no_container() -> None: params = [ "--enable-ext", - "--singularity", get_data("tests/wf/nvidia-smi.cwl"), ] assert main(params) == 0 + + +@pytest.mark.skipif( + cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected" +) +def test_cuda_cc_list() -> None: + params = [ + "--enable-ext", + get_data("tests/wf/nvidia-smi-cc.cwl"), + ] + assert main(params) == 0 + + +def _makebuilder(cudaReq: CWLObjectType) -> Builder: + return Builder( + {}, + [], + [], + {}, + schema.Names(), + [cudaReq], + [], + {"cudaDeviceCount": 1}, + None, + None, + StdFsAccess, + StdFsAccess(""), + None, + 0.1, + False, + False, + False, + "", + "", + "", + "", + INTERNAL_VERSION, + "docker", + ) + + +@mock.patch("subprocess.check_output") +@mock.patch("os.makedirs") +def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None: + + runtime_context = RuntimeContext({}) + + cudaReq: CWLObjectType = { + "class": "http://commonwl.org/cwltool#CUDARequirement", + "cudaVersionMin": "1.0", + "cudaComputeCapability": "1.0", + } + builder = _makebuilder(cudaReq) + + check_output.return_value = """ + +1 +1.0 + +""" + + jb = CommandLineJob(builder, {}, PathMapper, [], [], "") + jb._setup(runtime_context) + + +@mock.patch("subprocess.check_output") +@mock.patch("os.makedirs") +def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None: + + runtime_context = RuntimeContext({}) + + cudaReq: CWLObjectType = { + "class": "http://commonwl.org/cwltool#CUDARequirement", + "cudaVersionMin": "2.0", + "cudaComputeCapability": "1.0", + } + builder = _makebuilder(cudaReq) + + check_output.return_value = """ + +1 +1.0 + +""" + jb = CommandLineJob(builder, {}, PathMapper, [], [], "") + with pytest.raises(WorkflowException): + jb._setup(runtime_context) + + +def test_cuda_eval_resource_range() -> None: + with open(get_data("cwltool/extensions-v1.1.yml")) as res: + use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read()) + + joborder = {} # type: CWLObjectType + loadingContext = LoadingContext({"do_update": True}) + runtime_context = RuntimeContext({}) + + tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext) + builder = _makebuilder(tool.requirements[0]) + builder.job = joborder + + resources = tool.evalResources(builder, runtime_context) + + assert resources["cudaDeviceCount"] == 2 + + +def test_cuda_eval_resource_max() -> None: + with open(get_data("cwltool/extensions-v1.1.yml")) as res: + use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read()) + + joborder = {} # type: CWLObjectType + loadingContext = LoadingContext({"do_update": True}) + runtime_context = RuntimeContext({}) + + tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext) + builder = _makebuilder(tool.requirements[0]) + builder.job = joborder + + resources = tool.evalResources(builder, runtime_context) + + assert resources["cudaDeviceCount"] == 4 diff --git a/tests/wf/nvidia-smi-cc.cwl b/tests/wf/nvidia-smi-cc.cwl new file mode 100644 index 000000000..a4f315b0e --- /dev/null +++ b/tests/wf/nvidia-smi-cc.cwl @@ -0,0 +1,17 @@ +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + cwltool: "http://commonwl.org/cwltool#" +requirements: + cwltool:CUDARequirement: + cudaVersionMin: "1.0" + cudaComputeCapability: ["1.0", "2.0", "3.0"] + cudaDeviceCountMin: $(inputs.gpus) +inputs: + gpus: + type: int + default: 1 +outputs: [] +# Assume this will exit non-zero (resulting in a failing test case) if +# nvidia-smi doesn't detect any devices. +baseCommand: "nvidia-smi" diff --git a/tests/wf/nvidia-smi-container.cwl b/tests/wf/nvidia-smi-container.cwl index 0ef0296a2..84fd72d83 100644 --- a/tests/wf/nvidia-smi-container.cwl +++ b/tests/wf/nvidia-smi-container.cwl @@ -5,7 +5,7 @@ $namespaces: requirements: cwltool:CUDARequirement: cudaVersionMin: "1.0" - cudaComputeCapabilityMin: "1.0" + cudaComputeCapability: "1.0" DockerRequirement: dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04" inputs: [] diff --git a/tests/wf/nvidia-smi-max.cwl b/tests/wf/nvidia-smi-max.cwl new file mode 100644 index 000000000..d3d4d5e9c --- /dev/null +++ b/tests/wf/nvidia-smi-max.cwl @@ -0,0 +1,14 @@ +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + cwltool: "http://commonwl.org/cwltool#" +requirements: + cwltool:CUDARequirement: + cudaVersionMin: "1.0" + cudaComputeCapability: "1.0" + cudaDeviceCountMax: 4 +inputs: [] +outputs: [] +# Assume this will exit non-zero (resulting in a failing test case) if +# nvidia-smi doesn't detect any devices. +baseCommand: "nvidia-smi" diff --git a/tests/wf/nvidia-smi-range.cwl b/tests/wf/nvidia-smi-range.cwl new file mode 100644 index 000000000..19d3ea43c --- /dev/null +++ b/tests/wf/nvidia-smi-range.cwl @@ -0,0 +1,15 @@ +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + cwltool: "http://commonwl.org/cwltool#" +requirements: + cwltool:CUDARequirement: + cudaVersionMin: "1.0" + cudaComputeCapability: "1.0" + cudaDeviceCountMin: 2 + cudaDeviceCountMax: 4 +inputs: [] +outputs: [] +# Assume this will exit non-zero (resulting in a failing test case) if +# nvidia-smi doesn't detect any devices. +baseCommand: "nvidia-smi" diff --git a/tests/wf/nvidia-smi.cwl b/tests/wf/nvidia-smi.cwl index 88c72ddc6..8e227d0c5 100644 --- a/tests/wf/nvidia-smi.cwl +++ b/tests/wf/nvidia-smi.cwl @@ -5,8 +5,12 @@ $namespaces: requirements: cwltool:CUDARequirement: cudaVersionMin: "1.0" - cudaComputeCapabilityMin: "1.0" -inputs: [] + cudaComputeCapability: "1.0" + cudaDeviceCountMin: $(inputs.gpus) +inputs: + gpus: + type: int + default: 1 outputs: [] # Assume this will exit non-zero (resulting in a failing test case) if # nvidia-smi doesn't detect any devices.