Skip to content

Commit

Permalink
cuda: better check our expectations of the nvidia-smi -x XML output
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-c committed May 13, 2023
1 parent 96c711a commit 51d91cf
Show file tree
Hide file tree
Showing 2 changed files with 177 additions and 3 deletions.
32 changes: 29 additions & 3 deletions cwltool/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,35 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
_logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
return ("", 0)
dm = xml.dom.minidom.parseString(out) # nosec
ag = dm.getElementsByTagName("attached_gpus")[0].firstChild
cv = dm.getElementsByTagName("cuda_version")[0].firstChild
return (cv.data, int(ag.data))

ag = dm.getElementsByTagName("attached_gpus")
if len(ag) < 1 or ag[0].firstChild is None:
_logger.warning(
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
out,
)
return ("", 0)
ag_element = ag[0].firstChild

cv = dm.getElementsByTagName("cuda_version")
if len(cv) < 1 or cv[0].firstChild is None:
_logger.warning(
"Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty.: %s",
out,
)
return ("", 0)
cv_element = cv[0].firstChild

if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
ag_element, xml.dom.minidom.Text
):
return (cv_element.data, int(ag_element.data))
_logger.warning(
"Error checking CUDA version with nvidia-smi. "
"Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
out,
)
return ("", 0)


def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
Expand Down
148 changes: 148 additions & 0 deletions tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,154 @@ def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock)
jb._setup(runtime_context)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_attached_gpus(
    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
    """Job setup fails and logs a warning when ``<attached_gpus>`` is empty.

    ``subprocess.check_output`` is mocked to return an XML document whose
    ``attached_gpus`` element has no content; ``_setup`` is expected to raise
    ``WorkflowException`` and the captured log must contain the matching
    warning text.
    """
    expected_warning = (
        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
    )

    context = RuntimeContext({})
    cuda_requirement: CWLObjectType = {
        "class": "http://commonwl.org/cwltool#CUDARequirement",
        "cudaVersionMin": "1.0",
        "cudaComputeCapability": "1.0",
    }
    builder = _makebuilder(cuda_requirement)

    # Mocked command output: the GPU count element is present but empty.
    check_output.return_value = """
<nvidia>
<attached_gpus></attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

    job = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
        job._setup(context)
    assert expected_warning in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
    """Job setup fails and logs a warning when ``<attached_gpus>`` is absent.

    ``subprocess.check_output`` is mocked to return an XML document that has
    no ``attached_gpus`` element at all; ``_setup`` is expected to raise
    ``WorkflowException`` and the captured log must contain the matching
    warning text.
    """
    expected_warning = (
        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
    )

    context = RuntimeContext({})
    cuda_requirement: CWLObjectType = {
        "class": "http://commonwl.org/cwltool#CUDARequirement",
        "cudaVersionMin": "1.0",
        "cudaComputeCapability": "1.0",
    }
    builder = _makebuilder(cuda_requirement)

    # Mocked command output: only the version element is present.
    check_output.return_value = """
<nvidia>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

    job = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
        job._setup(context)
    assert expected_warning in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_cuda_version(
    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
    """Job setup fails and logs a warning when ``<cuda_version>`` is empty.

    ``subprocess.check_output`` is mocked to return an XML document whose
    ``cuda_version`` element has no content; ``_setup`` is expected to raise
    ``WorkflowException`` and the captured log must contain the matching
    warning text.
    """
    expected_warning = (
        "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
    )

    context = RuntimeContext({})
    cuda_requirement: CWLObjectType = {
        "class": "http://commonwl.org/cwltool#CUDARequirement",
        "cudaVersionMin": "1.0",
        "cudaComputeCapability": "1.0",
    }
    builder = _makebuilder(cuda_requirement)

    # Mocked command output: the version element is present but empty.
    check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version></cuda_version>
</nvidia>
"""

    job = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
        job._setup(context)
    assert expected_warning in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_missing_cuda_version(
    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
    """Job setup fails and logs a warning when ``<cuda_version>`` is absent.

    ``subprocess.check_output`` is mocked to return an XML document that has
    no ``cuda_version`` element at all; ``_setup`` is expected to raise
    ``WorkflowException`` and the captured log must contain the matching
    warning text.
    """
    expected_warning = (
        "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
    )

    context = RuntimeContext({})
    cuda_requirement: CWLObjectType = {
        "class": "http://commonwl.org/cwltool#CUDARequirement",
        "cudaVersionMin": "1.0",
        "cudaComputeCapability": "1.0",
    }
    builder = _makebuilder(cuda_requirement)

    # Mocked command output: only the GPU count element is present.
    check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
</nvidia>
"""

    job = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
        job._setup(context)
    assert expected_warning in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_wrong_type_cuda_version(
    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
    """Job setup fails and logs a warning when ``<cuda_version>`` holds an element.

    ``subprocess.check_output`` is mocked to return an XML document whose
    ``cuda_version`` element contains a child element instead of text;
    ``_setup`` is expected to raise ``WorkflowException`` and the captured log
    must contain the "not a text node" warning.
    """
    expected_warning = (
        "Error checking CUDA version with nvidia-smi. "
        "Either 'attached_gpus' or 'cuda_version' was not a text node"
    )

    context = RuntimeContext({})
    cuda_requirement: CWLObjectType = {
        "class": "http://commonwl.org/cwltool#CUDARequirement",
        "cudaVersionMin": "1.0",
        "cudaComputeCapability": "1.0",
    }
    builder = _makebuilder(cuda_requirement)

    # Mocked command output: the version element wraps a sub-element, not text.
    check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version><subelement /></cuda_version>
</nvidia>
"""

    job = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
        job._setup(context)
    assert expected_warning in caplog.text


def test_cuda_eval_resource_range() -> None:
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
Expand Down

0 comments on commit 51d91cf

Please sign in to comment.