Cuda request gpu device expr (#1629)
* Allow expressions to dynamically request min/max number of GPUs.

* Add test coverage for CUDA checks
tetron authored Feb 24, 2022
1 parent 4c65bdf commit 3d2d119
Showing 15 changed files with 279 additions and 47 deletions.
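
As a quick orientation (a sketch with illustrative values, not taken from the commit): the updated CUDARequirement lets the device-count fields hold CWL expressions as well as plain integers. Written here in the CWLObjectType dict form that the test suite below also uses:

# Hypothetical CUDARequirement; values are illustrative, not from the commit.
# cudaDeviceCountMin/Max may be ints or CWL expressions.
cuda_req = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "11.4",
    "cudaComputeCapability": "3.0",
    "cudaDeviceCountMin": 1,
    "cudaDeviceCountMax": "$(inputs.gpus)",  # expression evaluated per job
}
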
10 changes: 4 additions & 6 deletions cwltool/cuda.py
@@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
return (cv.data, int(ag.data))


def cuda_check(cuda_req: CWLObjectType) -> int:
def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
try:
vmin = float(str(cuda_req["cudaVersionMin"]))
version, devices = cuda_version_and_device_count()
@@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
"CUDA version '%s' is less than minimum version '%s'", version, vmin
)
return 0
dmin = cast(int, cuda_req.get("deviceCountMin", 1))
dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
if devices < dmin:
if requestCount > devices:
_logger.warning(
"Requested at least %d GPU devices but only %d available", dmin, devices
"Requested %d GPU devices but only %d available", requestCount, devices
)
return 0
return min(dmax, devices)
return requestCount
except Exception as e:
_logger.warning("Error checking CUDA requirements: %s", e)
return 0
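
A minimal sketch of the new call pattern, mirroring how job.py (below) now passes an already-evaluated device count; the requirement values and the requested count are assumptions, and the error handling is simplified:

import math

from cwltool.cuda import cuda_check

# Hypothetical requirement; "requested" stands in for the evaluated
# builder.resources["cudaDeviceCount"]. math.ceil is used because a CWL
# expression may evaluate to a float.
cuda_req = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "1.0",
    "cudaComputeCapability": "1.0",
}
requested = math.ceil(1.0)

# cuda_check() returns the requested count if the detected CUDA version and
# device count can satisfy it, and 0 otherwise (cwltool's _setup raises
# WorkflowException when it gets 0).
count = cuda_check(cuda_req, requested)
if count == 0:
    raise RuntimeError("Could not satisfy CUDARequirement")
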
9 changes: 2 additions & 7 deletions cwltool/docker.py
@@ -397,13 +397,8 @@ def create_runtime(
if runtimeContext.rm_container:
runtime.append("--rm")

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
count = cuda_check(cuda_req)
runtime.append("--gpus=" + str(count))
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))

cidfile_path = None # type: Optional[str]
# add parameters to docker to write a container ID file
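
A sketch of the effect of the docker.py change, with assumed values for builder.resources: the --gpus flag is now derived from the evaluated resources rather than from a second cuda_check() call.

# Assumed example values for builder.resources after resource evaluation.
resources = {"cores": 1, "ram": 256, "cudaDeviceCount": 2}

runtime = ["docker", "run", "-i", "--rm"]
if resources.get("cudaDeviceCount"):
    runtime.append("--gpus=" + str(resources["cudaDeviceCount"]))

print(runtime)  # ['docker', 'run', '-i', '--rm', '--gpus=2']
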
3 changes: 3 additions & 0 deletions cwltool/executors.py
@@ -305,6 +305,9 @@ def select_resources(
result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
result["outdirSize"] = math.ceil(request["outdirMin"])

if "cudaDeviceCount" in request:
result["cudaDeviceCount"] = request["cudaDeviceCount"]

return result

def _runner(self, job, runtime_context, TMPDIR_LOCK):
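
A minimal sketch of the pass-through added to select_resources(): when the evaluated request carries a "cudaDeviceCount" entry, it is copied into the selected resources unchanged. The surrounding request keys and values are assumptions.

from typing import Dict, Union

def pick_gpu_count(request: Dict[str, Union[int, float]]) -> Dict[str, Union[int, float]]:
    # Mirror of the added lines: copy the GPU count through if present.
    result: Dict[str, Union[int, float]] = {}
    if "cudaDeviceCount" in request:
        result["cudaDeviceCount"] = request["cudaDeviceCount"]
    return result

print(pick_gpu_count({"coresMin": 1, "cudaDeviceCount": 2}))  # {'cudaDeviceCount': 2}
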
34 changes: 25 additions & 9 deletions cwltool/extensions-v1.1.yml
@@ -93,13 +93,29 @@ $graph:
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.
* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.
* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
doc: |
Minimum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMax`. If neither is specified,
default 1.
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: |
Maximum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMin`.
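
To illustrate the array form documented above (hypothetical values, not from the commit): listing compute capabilities restricts selection to exactly those values, rather than treating a single value as a minimum.

# Hypothetical requirement: only GPUs whose compute capability is exactly
# "3.5" or "7.0" are eligible.
cuda_req_cc_list = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "11.2",
    "cudaComputeCapability": ["3.5", "7.0"],
}
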
34 changes: 25 additions & 9 deletions cwltool/extensions.yml
@@ -203,13 +203,29 @@ $graph:
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.
* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.
* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
doc: |
Minimum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMax`. If neither is specified,
default 1.
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: |
Maximum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMin`.
5 changes: 4 additions & 1 deletion cwltool/job.py
@@ -2,6 +2,7 @@
import functools
import itertools
import logging
import math
import os
import re
import shutil
@@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
count = cuda_check(cuda_req)
count = cuda_check(
cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
)
if count == 0:
raise WorkflowException("Could not satisfy CUDARequirement")

32 changes: 26 additions & 6 deletions cwltool/process.py
@@ -980,6 +980,7 @@ def evalResources(
resourceReq, _ = self.get_requirement("ResourceRequirement")
if resourceReq is None:
resourceReq = {}

cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
if cwl_version == "v1.0":
ram = 1024
@@ -995,20 +996,34 @@
"outdirMin": 1024,
"outdirMax": 1024,
}
for a in ("cores", "ram", "tmpdir", "outdir"):

cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
if cudaReq:
request["cudaDeviceCountMin"] = 1
request["cudaDeviceCountMax"] = 1

for rsc, a in (
(resourceReq, "cores"),
(resourceReq, "ram"),
(resourceReq, "tmpdir"),
(resourceReq, "outdir"),
(cudaReq, "cudaDeviceCount"),
):
if rsc is None:
continue
mn = mx = None # type: Optional[Union[int, float]]
if resourceReq.get(a + "Min"):
if rsc.get(a + "Min"):
mn = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Min"])
builder, cast(Union[str, int, float], rsc[a + "Min"])
),
)
if resourceReq.get(a + "Max"):
if rsc.get(a + "Max"):
mx = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Max"])
builder, cast(Union[str, int, float], rsc[a + "Max"])
),
)
if mn is None:
@@ -1022,13 +1037,18 @@

request_evaluated = cast(Dict[str, Union[int, float]], request)
if runtimeContext.select_resources is not None:
# Call select resources hook
return runtimeContext.select_resources(request_evaluated, runtimeContext)
return {

defaultReq = {
"cores": request_evaluated["coresMin"],
"ram": math.ceil(request_evaluated["ramMin"]),
"tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
"outdirSize": math.ceil(request_evaluated["outdirMin"]),
}
if cudaReq:
defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
return defaultReq

def validate_hints(
self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
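
Putting the process.py change together, a rough sketch of the default selection path (no select_resources hook; values assumed): the min/max fields are evaluated, the default result takes the minimum, and the Docker and Singularity runtimes read it back from the resources as cudaDeviceCount.

import math

# Assumed evaluated request, as evalResources() would hold it for a
# CUDARequirement with cudaDeviceCountMin: 2 and cudaDeviceCountMax: 4.
request_evaluated = {
    "coresMin": 1, "ramMin": 256, "tmpdirMin": 1024, "outdirMin": 1024,
    "cudaDeviceCountMin": 2, "cudaDeviceCountMax": 4,
}

default_req = {
    "cores": request_evaluated["coresMin"],
    "ram": math.ceil(request_evaluated["ramMin"]),
    "tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
    "outdirSize": math.ceil(request_evaluated["outdirMin"]),
    # With no select_resources hook, the default takes the minimum GPU count.
    "cudaDeviceCount": request_evaluated["cudaDeviceCountMin"],
}
print(default_req["cudaDeviceCount"])  # 2
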
6 changes: 1 addition & 5 deletions cwltool/singularity.py
@@ -434,11 +434,7 @@ def create_runtime(
else:
runtime.extend(["--net", "--network", "none"])

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--nv")

for name, value in self.environment.items():
1 change: 1 addition & 0 deletions mypy-requirements.txt
@@ -2,3 +2,4 @@ mypy==0.931
types-requests
types-setuptools
types-psutil
types-mock
136 changes: 135 additions & 1 deletion tests/test_cuda.py
@@ -1,10 +1,24 @@
import mock
import pytest
from schema_salad.avro import schema

from cwltool.builder import Builder
from cwltool.context import LoadingContext, RuntimeContext
from cwltool.cuda import cuda_version_and_device_count
from cwltool.errors import WorkflowException
from cwltool.job import CommandLineJob
from cwltool.load_tool import load_tool
from cwltool.main import main
from cwltool.pathmapper import MapperEnt, PathMapper
from cwltool.process import use_custom_schema, use_standard_schema
from cwltool.stdfsaccess import StdFsAccess
from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION
from cwltool.utils import CWLObjectType

from .util import get_data, needs_docker, needs_singularity_3_or_newer

from unittest.mock import MagicMock

cuda_version = cuda_version_and_device_count()


@@ -39,7 +53,127 @@ def test_cuda_singularity() -> None:
def test_cuda_no_container() -> None:
params = [
"--enable-ext",
"--singularity",
get_data("tests/wf/nvidia-smi.cwl"),
]
assert main(params) == 0


@pytest.mark.skipif(
cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_cc_list() -> None:
params = [
"--enable-ext",
get_data("tests/wf/nvidia-smi-cc.cwl"),
]
assert main(params) == 0


def _makebuilder(cudaReq: CWLObjectType) -> Builder:
return Builder(
{},
[],
[],
{},
schema.Names(),
[cudaReq],
[],
{"cudaDeviceCount": 1},
None,
None,
StdFsAccess,
StdFsAccess(""),
None,
0.1,
False,
False,
False,
"",
"",
"",
"",
INTERNAL_VERSION,
"docker",
)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:

runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "1.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
jb._setup(runtime_context)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:

runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "2.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""
jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)


def test_cuda_eval_resource_range() -> None:
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())

joborder = {} # type: CWLObjectType
loadingContext = LoadingContext({"do_update": True})
runtime_context = RuntimeContext({})

tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext)
builder = _makebuilder(tool.requirements[0])
builder.job = joborder

resources = tool.evalResources(builder, runtime_context)

assert resources["cudaDeviceCount"] == 2


def test_cuda_eval_resource_max() -> None:
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())

joborder = {} # type: CWLObjectType
loadingContext = LoadingContext({"do_update": True})
runtime_context = RuntimeContext({})

tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext)
builder = _makebuilder(tool.requirements[0])
builder.job = joborder

resources = tool.evalResources(builder, runtime_context)

assert resources["cudaDeviceCount"] == 4
(Diffs for the remaining 5 changed files were not loaded and are not shown.)
