Commit 5db8ff7

Add Performance Thresholds to Test Runner (#36)

Tyler Titsworth and pre-commit-ci[bot] authored
Signed-off-by: Tyler Titsworth <tyler.titsworth@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

1 parent fc9afb4 commit 5db8ff7

10 files changed: +185 -24 lines changed

.gitignore (+1)

@@ -10,6 +10,7 @@
 docs/assets
 docs/repos/
 logs/
+models-perf/
 output/
 site
 venv/

.pre-commit-config.yaml (+2 -2)

@@ -72,12 +72,12 @@ repos:
        language: system
        name: pylint
        types: [python]
-      - entry: bash -c "python -m tox -e py310"
+      - entry: bash -c "python -m tox -e py310,clean"
        files: ^test-runner/
        id: tox
        language: system
        name: tox
-      - entry: bash -c "mkdocs build --clean"
+      - entry: bash -c "rm -rf site/ && mkdocs build --clean"
        # files: ^docs/
        id: mkdocs
        language: system

test-runner/README.md (+20 -5)

@@ -30,6 +30,7 @@ A test is defined as a set of commands to be executed along with their associate
 | [volumes](https://github.com/compose-spec/compose-spec/blob/master/spec.md#volumes) | Optional[List[[Volume](utils/test.py#L13)]] | A list of volumes to be mounted when running the test in a container. |
 | [env](https://github.com/compose-spec/compose-spec/blob/master/spec.md#environment) | Optional[Dict[str, str]] | A list of environment variables to be set when the test is running. |
 | mask | Optional[List[str]] | A list of keys to [mask](#masking) in the test output. |
+| performance | Optional[str] | Check test performance thresholds in the format `perf/path/to/model.yaml:test-id` |
 | notebook | Optional[str] | A flag indicating whether the test utilizes a [jupyter notebook](#notebook-test). |
 | serving | Optional[str] | A flag indicating whether a [serving test](#serving-test) should be invoked. |
 | [cap_add](https://github.com/compose-spec/compose-spec/blob/master/spec.md#cap_add) | Optional[str] | Specifies additional container capabilities. |

@@ -75,12 +76,12 @@ In the example above, the first output will be `hello`, and the second output wi

 Masking is a feature that allows you to hide sensitive information in the logs generated by the test runner. This is useful when you want to prevent benchmark information from being publicly exposed.

-To enable masking, add the `mask` parameter to your `tests.yaml` file as a list of strings. Each string should be a key whose value you want to mask without any kind of delimiter.
+To enbable masking, add the `mask` parameter to your `tests.yaml` file as a list of strings. Each string should be a key whose value you want to mask without any kind of delimiter.

-By default, masking is not enabled. To enable masking, use the `-m` flag when running the test runner application.
+By default, masking is enabled. To disable masking, add `"mask": [false]` to your `.actions.json` file.

 ```bash
-python -m -f path/to/tests.yaml
+python -f path/to/tests.yaml
 ```

 ```bash

@@ -92,6 +93,21 @@ test:

 In the example above, the output will be `hello:***`

+#### Performance Thresholds
+
+You can utilize performance thresholds stored in another github repository by providing the `PERF_REPO` environment variable in GitHub's `org-name/repo-name` format.
+
+```yaml
+test:
+  cmd: "echo 'my-key: 100'"
+  performance: perf/my-model:my-test-id
+```
+
+```bash
+export PERF_REPO=...
+python test-runner/test_runner.py -f path/to/tests.yaml
+```
+
 #### Notebook Test

 A notebook test is a special type of test designed to run Jupyter notebooks. This is indicated by setting the notebook attribute to `True` in the test definition. When a test is marked as a notebook test, the command specified in the cmd attribute is expected to be [papermill](https://github.com/nteract/papermill) command. If papermill is not already installed in the provided `image` property, then it will be installed.

@@ -139,7 +155,7 @@ For more options, see the `--help` output below:

 ```text
 $ python test_runner.py --help
-usage: test_runner.py [-h] [-a ACTIONS_PATH] -f FILE_PATH [-v] [-l LOGS_PATH] [-m]
+usage: test_runner.py [-h] [-a ACTIONS_PATH] -f FILE_PATH [-v] [-l LOGS_PATH]

 optional arguments:
   -h, --help            show this help message and exit

@@ -150,7 +166,6 @@ optional arguments:
   -v, --verbose         DEBUG Loglevel
   -l LOGS_PATH, --logs LOGS_PATH
                         -l /path/to/logs
-  -m, --mask            Enable mask parameter for sensitive information in logs
 ```

 ### Run Modes
test-runner/dev-requirements.txt (+2)

@@ -1,7 +1,9 @@
 black>=24.4.1
 coverage>=7.5.0
 expandvars>=0.12.0
+gitpython>=3.1.43
 hypothesis>=6.100.1
+Pint>=0.21.1
 pydantic==2.7.2
 pylint>=3.1.0
 pytest>=8.1.1

test-runner/requirements.txt (+2)

@@ -1,4 +1,6 @@
 expandvars>=0.12.0
+gitpython>=3.1.43
+Pint>=0.21.1
 pydantic==2.7.2
 python_on_whales>=0.70.1
 pyyaml>=6.0.1

test-runner/test_runner.py (+2 -2)

@@ -38,7 +38,7 @@
 from expandvars import expandvars
 from python_on_whales import DockerException, docker
 from tabulate import tabulate
-from utils.test import Test
+from utils.test import PerfException, Test
 from yaml import YAMLError, full_load


@@ -187,7 +187,7 @@ def get_test_list(args: dict, tests_yaml: List[dict]):
         # returns the stdout of the test and the RETURNCODE
         try:  # Try for Runtime Failure Conditions
             log = test.container_run() if test.img else test.run()
-        except DockerException as err:
+        except (DockerException, PerfException, YAMLError) as err:
             logging.error(err)
             summary.append([idx + 1, test.name, "FAIL"])
             ERROR = True

test-runner/tests.yaml (+13 -8)

@@ -15,17 +15,17 @@
 test1:
   img: ${REGISTRY}/${REPO}:latest # substitute env from host
   cmd: head -n 1 /workspace/test-runner/requirements.txt # volume mounted file
-    # device: /dev/dri
-    # ipc: host
+  # device: /dev/dri
+  # ipc: host
   notebook: True
   env:
     REGISTRY: ${REGISTRY} # substitute env from host
     DEBUG: 'true' # single quotes
   volumes:
-  - src: /tf_dataset
-    dst: /tmp
-  - src: $PWD
-    dst: /workspace
+    - src: /tf_dataset
+      dst: /tmp
+    - src: $PWD
+      dst: /workspace
 test2:
   cmd: echo -n $TEST && python -c 'print(" World", end="")' # var substitution inline
   env:

@@ -41,8 +41,13 @@ test6:
   img: ${CACHE_REGISTRY}/cache/library/python:3.11-slim-bullseye
   cmd: "echo 'hello: world'"
   mask:
-  - hello
+    - hello
 test7:
   cmd: "echo 'world: hello'"
   mask:
-  - world
+    - world
+test8:
+  cmd: "echo 'test: 123 throughput'"
+  mask:
+    - test
+  performance: perf/test.yaml:test
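The new `test8` can only pass if the cloned perf repository actually contains `perf/test.yaml` with a `test_id: test` entry. A hypothetical shape consistent with the unit tests added below (123 throughput passes while 121 fails, so a boundary of 122 is a guess between the two; the `not-test` entry is what would make the fourth unit-test case raise):

```yaml
# models-perf/perf/test.yaml (hypothetical; shape inferred from check_perf)
- test_id: test
  name: example-threshold
  modelName: example-model
  key: test              # matches "test: 123 throughput" in the output
  boundary: 122          # guess: 123 throughput passes, 121 throughput fails
  lower_is_better: false
  unit: throughput       # custom unit from models-perf/definitions.txt
- test_id: not-test
  name: unreachable-threshold
  modelName: example-model
  key: test
  boundary: 1000         # guess: high enough that 125 throughput always fails
  lower_is_better: false
  unit: throughput
```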

test-runner/tests/utest.py (+46 -3)

@@ -21,7 +21,7 @@
 from hypothesis import given
 from hypothesis.strategies import dictionaries, text
 from test_runner import get_test_list, parse_args, set_log_filename
-from utils.test import Test
+from utils.test import PerfException, Test


 @pytest.fixture

@@ -143,6 +143,11 @@ def test_get_test_list(test_args_input, test_json_input):
             "mask": ["hello"],
         },
         "test7": {"cmd": "echo 'world: hello'", "mask": ["world"]},
+        "test8": {
+            "cmd": "echo 'test: 123 throughput'",
+            "mask": ["test"],
+            "performance": "perf/test.yaml:test",
+        },
     }

     test_fn, disable_masking = get_test_list(test_args_input, test_json_input)

@@ -154,9 +159,47 @@ def test_masking(test_class_input):
     "test masking."
     for test in test_class_input:
         if test.mask != [] and test.img:
-            assert ":***" in test.container_run()
+            assert ": ***" in test.container_run()
         if test.mask != [] and not test.img:
-            assert ":***" in test.run()
+            assert ": ***" in test.run()
+
+
+def test_perf_thresholds():
+    "test performance thresholds."
+    test_cases = [
+        {
+            "cmd": "echo 'test: 123 throughput'",
+            "performance": "perf/test.yaml:test",
+            "expected_output": "test: 123 throughput",
+            "should_raise_exception": False,
+        },
+        {
+            "cmd": "echo 'test: 121 throughput'",
+            "performance": "perf/test.yaml:test",
+            "should_raise_exception": True,
+        },
+        {
+            "cmd": "echo 'test: 123 millithroughput'",
+            "performance": "perf/test.yaml:test",
+            "should_raise_exception": True,
+        },
+        {
+            "cmd": "echo 'test: 125 throughput'",
+            "performance": "perf/test.yaml:not-test",
+            "should_raise_exception": True,
+        },
+    ]
+
+    for test_case in test_cases:
+        test = Test(name="test", **test_case)
+        if test_case["should_raise_exception"]:
+            try:
+                with pytest.raises(Exception, match="Failed") as exc_info:
+                    test.run()
+            except:
+                assert isinstance(exc_info.value, PerfException)
+        else:
+            assert test_case["expected_output"] in test.run()


 @given(name=text(), arguments=dictionaries(text(), text()))
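The `try`/`except` around `pytest.raises(Exception, match="Failed")` in the new test is doing double duty: it tolerates both the exception escaping `test.run()` and the match check itself failing. If `PerfException` reliably propagates out of `Test.run()`, a more direct assertion is possible. A minimal sketch, assuming the same hypothetical `perf/test.yaml:test` threshold as above and a populated `models-perf` checkout:

```python
import pytest
from utils.test import PerfException, Test


def test_perf_threshold_violation():
    # Hypothetical standalone check: a measured value below the boundary
    # (121 < 122 throughput, higher-is-better) should raise PerfException.
    test = Test(
        name="test",
        cmd="echo 'test: 121 throughput'",
        performance="perf/test.yaml:test",
    )
    with pytest.raises(PerfException):
        test.run()
```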

test-runner/utils/test.py (+90 -4)

@@ -21,9 +21,27 @@
 from subprocess import PIPE, Popen
 from typing import Dict, List, Optional

+import pint
 from expandvars import expandvars
+from git import Repo
 from pydantic import BaseModel
 from python_on_whales import DockerException, docker
+from yaml import YAMLError, full_load
+
+units = pint.UnitRegistry()
+
+
+class PerfException(Exception):
+    "Constructs a PerfException class."
+
+
+class Threshold(BaseModel):
+    "Constructs a Threshold class."
+    name: str
+    modelName: str
+    boundary: float
+    lower_is_better: bool
+    unit: str


 class Volume(BaseModel):

@@ -49,12 +67,28 @@ class Test(BaseModel):
     groups_add: Optional[List[str]] = ["109", "44"]
     hostname: Optional[str] = None
     ipc: Optional[str] = None
+    performance: Optional[str] = None
     privileged: Optional[bool] = False
     pull: Optional[str] = "missing"
     user: Optional[str] = None
     shm_size: Optional[str] = None
     workdir: Optional[str] = None

+    def __init__(self, **data):
+        super().__init__(**data)
+        if self.performance:
+            perf_repo = os.environ.get("PERF_REPO")
+            if perf_repo:
+                if not os.path.exists("models-perf"):
+                    Repo.clone_from(
+                        f"https://github.com/{perf_repo}", "models-perf", progress=None
+                    )
+            else:
+                logging.error(
+                    "Performance mode enabled, but PERF_REPO environment variable not set"
+                )
+            units.load_definitions("./models-perf/definitions.txt")
+
     def get_path(self, name):
         """Given a filename, find that file from the users current working directory

@@ -171,6 +205,54 @@ def notebook_run(self, img: str):
             load=True,
         )

+    def check_perf(self, content):
+        """
+        Check the performance of the test against the thresholds.
+
+        Args:
+            content (str): test output log
+
+        Raises:
+            PerfException: if the performance does not meet the target performance
+        """
+        with open(
+            f"models-perf/{self.performance.split(':')[0]}", "r", encoding="utf-8"
+        ) as file:
+            try:
+                thresholds = full_load(file)
+            except YAMLError as yaml_exc:
+                raise YAMLError(yaml_exc)
+        model_thresholds = [
+            threshold
+            for threshold in thresholds
+            if self.performance.split(":")[1] == threshold["test_id"]
+        ]
+        for threshold in model_thresholds:
+            perf = re.search(
+                rf"{threshold['key']}[:]?\s+(.\d+[\s]?.*)",
+                content,
+                re.IGNORECASE,
+            )
+            if perf:
+                if threshold["lower_is_better"]:
+                    if units.Quantity(perf.group(1)) > units.Quantity(
+                        f"{threshold['boundary']} {threshold['unit']}"
+                    ):
+                        if not self.mask:
+                            logging.info("%s: %s", threshold["key"], perf.group(1))
+                        raise PerfException(
+                            f"Performance Threshold {threshold['name']} did not meet the target performance."
+                        )
+                else:
+                    if units.Quantity(perf.group(1)) < units.Quantity(
+                        f"{threshold['boundary']} {threshold['unit']}"
+                    ):
+                        if not self.mask:
+                            logging.info("%s: %s", threshold["key"], perf.group(1))
+                        raise PerfException(
+                            f"Performance Threshold {threshold['name']} did not meet the target performance."
+                        )
+
     def container_run(self):
         """Runs the docker container.

@@ -235,9 +317,11 @@ def container_run(self):
         log = ""
         for _, stream_content in output_generator:
             # All process logs will have the stream_type of stderr despite it being stdout
+            if self.performance:
+                self.check_perf(stream_content.decode("utf-8"))
             for item in self.mask:
                 stream_content = re.sub(
-                    rf"({item}[:=-_\s])(.*)",
+                    rf"({item}[:]?\s+)(.*)",
                     r"\1***",
                     stream_content.decode("utf-8"),
                 ).encode("utf-8")
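The masking regex change above is worth a closer look. In the old pattern, the character class `[:=-_\s]` contains an accidental range `=-_` (every ASCII character from `=` through `_`, including all uppercase letters), and the single matched delimiter ended group 1, which is why masked output used to read `hello:***`. The new pattern keeps the optional colon and the following whitespace in group 1, yielding `hello: ***` and matching the updated assertions in `utest.py`. A quick standalone check:

```python
import re

line = "hello: world"

# Old pattern: "[:=-_\s]" accidentally defines the range '=' through '_',
# and the matched delimiter is the last character of group 1, so no space
# survives before the stars.
old = re.sub(r"(hello[:=-_\s])(.*)", r"\1***", line)
print(old)  # hello:***

# New pattern: the optional colon plus required whitespace stay in group 1.
new = re.sub(r"(hello[:]?\s+)(.*)", r"\1***", line)
print(new)  # hello: ***
```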
@@ -271,14 +355,16 @@ def run(self):
         )
         try:
             stdout, stderr = p.communicate()
+            if self.performance:
+                self.check_perf(stdout.decode("utf-8"))
             for item in self.mask:
                 stdout = re.sub(
-                    rf"({item}[:=-_\s])(.*)", r"\1***", stdout.decode("utf-8")
+                    rf"({item}[:]?\s+)(.*)", r"\1***", stdout.decode("utf-8")
                 ).encode("utf-8")
             if stderr:
-                logging.error(stderr.decode("utf-8"))
+                logging.error(stderr.decode("utf-8").strip())
             if stdout:
-                logging.info("Test Output: %s", stdout.decode("utf-8"))
+                logging.info("Test Output: %s", stdout.decode("utf-8").strip())
             return stdout.decode("utf-8")
         except KeyboardInterrupt:
             os.killpg(os.getpgid(p.pid), SIGKILL)
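`check_perf` leans on Pint to make the comparison unit-aware: both the value scraped from the log and the `boundary`/`unit` pair from the thresholds file are parsed into `Quantity` objects before comparing. That is what makes the `millithroughput` unit-test case fail the threshold even though its magnitude is 123. A minimal sketch, assuming `models-perf/definitions.txt` registers `throughput` as its own base dimension (that file is not part of this commit, so the definition line is a guess):

```python
import pint

units = pint.UnitRegistry()
# Assumption: the perf repo's definitions.txt contains a line like this,
# registering throughput as a new base dimension so Pint can parse it.
units.define("throughput = [throughput]")

boundary = units.Quantity("122 throughput")          # from the thresholds YAML
print(units.Quantity("123 throughput") > boundary)   # True  -> meets the target
print(units.Quantity("121 throughput") > boundary)   # False -> PerfException
# SI prefixes apply to custom units too: 123 millithroughput == 0.123 throughput
print(units.Quantity("123 millithroughput") > boundary)  # False -> PerfException
```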

tox.ini (+7)

@@ -15,6 +15,7 @@ passenv = DOCKER_*
 setenv =
     CACHE_REGISTRY = {env:CACHE_REGISTRY}
     PATH = {env:PATH}:/usr/local/bin/docker
+    PERF_REPO = {env:PERF_REPO}
     PWD = {env:PWD}
     REGISTRY = {env:REGISTRY}
     REPO = {env:REPO}

@@ -52,3 +53,9 @@ python =
     3.11: py311
     3.12: py312
 parallel_show_output = true
+
+[testenv:clean]
+allowlist_externals=/bin/bash
+commands =
+    /bin/bash -c "rm -rf .coverage* models-perf"
+ignore_errors = True
