Commit 31dfe45

Address feedback + fix merge conflicts

Signed-off-by: Finn Roblin <finnrobl@amazon.com>

2 parents acb9e94 + fa1adf2

31 files changed: +1941 -804 lines

.github/workflows/docker-test.yml (new file, +41)

@@ -0,0 +1,41 @@
+name: Docker Build and Test
+on:
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      logLevel:
+        description: Log level
+        required: true
+        default: warning
+        type: choice
+        options:
+          - info
+          - warning
+          - debug
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        platform: ['linux/amd64', 'linux/arm64']
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          version: 'v0.9.1'
+      - uses: actions/checkout@v4
+        with:
+          path: 'opensearch-benchmark-git'
+      - name: Docker Build ${{ matrix.platform }}
+        run: |
+          docker buildx version
+          cp -a opensearch-benchmark-git/* ./
+          echo "Disable VERSION arg to enter docker build test mode"
+          PLATFORM=${{ matrix.platform }}
+          PLATFORM=`echo $PLATFORM | tr '/' '-'`
+          docker buildx build --platform ${{ matrix.platform }} --build-arg BUILD_ENV=testing --build-arg BUILD_DATE=`date -u +%Y-%m-%dT%H:%M:%SZ` -f "docker/Dockerfile" -t "osb/osb-$PLATFORM" -o type=docker .
+          docker images | grep "osb/osb-$PLATFORM"
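A note on the `tr '/' '-'` step above: buildx platform identifiers contain a slash, which the workflow strips before embedding the platform in the image name `osb/osb-$PLATFORM`. A minimal Python sketch of the same transformation (the function name is illustrative, not repository code):

```python
# Illustrative stand-in for the shell step PLATFORM=`echo $PLATFORM | tr '/' '-'`.
def tag_suffix(platform: str) -> str:
    """Flatten a buildx platform string so it can be embedded in an image name."""
    return platform.replace("/", "-")

assert tag_suffix("linux/amd64") == "linux-amd64"  # image becomes osb/osb-linux-amd64
assert tag_suffix("linux/arm64") == "linux-arm64"  # image becomes osb/osb-linux-arm64
```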

.github/workflows/docker.yml (-42)

This file was deleted.

.github/workflows/manual-integ.yml (+13)

@@ -14,6 +14,19 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
+      - uses: KengoTODA/actions-setup-docker-compose@v1
+        with:
+          version: '1.29.2'
+      # - name: Enforce docker-compose v1
+      #   run: |
+      #     echo "GitHub starts to switch runners to include docker-compose v2"
+      #     echo "which uses 'docker compose' command to replace 'docker-compose'"
+      #     echo "this would cause issues in our test validation so we enforce v1 here"
+      #     echo "https://github.com/actions/runner-images/commit/2a4bc14da46f1f8e358aa902a69edb9bef135472"
+      #     sudo apt-get remove -y docker-compose-plugin
+      #     sudo pip install docker-compose==1.29.2
+      #     docker --version
+      #     docker-compose --version
      - name: Check out repository code
        uses: actions/checkout@v2
      - name: Clone pyenv

DEVELOPER_GUIDE.md (+2)

@@ -173,6 +173,8 @@ Integration tests are expected to run for approximately **20-30 mins** and can b
 * Amazon Linux 2
 * MacOS
 
+Integration tests run against the standard [OpenSearch Benchmark workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads). Sometimes, it may be necessary to run integration tests against a modified forked copy of these workloads. In that case, please follow the instructions [here](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/README.md#testing-the-workload).
+
 Invoke integration tests by running the following command within the root directory of the repository:
 
 ```

README.md (+6)

@@ -1,3 +1,9 @@
+[![CI](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/main.yml/badge.svg)](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/main.yml)
+[![Integration](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/manual-integ.yml/badge.svg)](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/manual-integ.yml)
+[![Release](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/release-drafter.yml/badge.svg)](https://github.com/opensearch-project/opensearch-benchmark/actions/workflows/release-drafter.yml)
+[![Chat](https://img.shields.io/badge/chat-on%20forums-blue)](https://forum.opensearch.org/categories)
+![PRs welcome!](https://img.shields.io/badge/PRs-welcome!-success)
+
 <img src="https://github.com/opensearch-project/opensearch-benchmark/blob/main/opensearch_benchmark.png?raw=true" height="64px" alt="OpenSearch Benchmark">
 
 OpenSearch Benchmark is the macrobenchmarking framework for OpenSearch.

docker/Dockerfile (+16, -8)

@@ -1,11 +1,19 @@
-###############################################################################
-# Install OpenSearch Benchmark from PyPI to build a Docker image
-###############################################################################
+########################################################
+# Install OpenSearch Benchmark to build a Docker image #
+########################################################
 
-FROM python:3.11.2-slim
 ARG VERSION
+ARG BUILD_ENV=production
 
-ENV BENCHMARK_RUNNING_IN_DOCKER True
+FROM python:3.11.2-slim as build_env_testing
+ONBUILD COPY opensearch-benchmark-git/ ./
+
+FROM python:3.11.2-slim as build_env_production
+ONBUILD RUN echo Production Environment
+
+FROM build_env_${BUILD_ENV}
+WORKDIR /opensearch-benchmark
+ENV BENCHMARK_RUNNING_IN_DOCKER=True
 
 RUN apt-get -y update && \
     apt-get install -y curl git gcc pbzip2 pigz && \
@@ -15,9 +23,9 @@ RUN apt-get -y update && \
 RUN groupadd --gid 1000 opensearch-benchmark && \
     useradd -d /opensearch-benchmark -m -k /dev/null -g 1000 -N -u 1000 -l -s /bin/bash benchmark
 
-RUN if [ -z "$VERSION" ] ; then python3 -m pip install opensearch-benchmark ; else python3 -m pip install opensearch-benchmark==$VERSION ; fi
-
-WORKDIR /opensearch-benchmark
+ENV PIP_ONLY_BINARY=h5py
+RUN if [ "$BUILD_ENV" = "testing" ] ; then echo Testing; ls -l; python3 -m pip install -e . ; \
+    else echo Production; if [ -z "$VERSION" ] ; then python3 -m pip install opensearch-benchmark ; else python3 -m pip install opensearch-benchmark==$VERSION ; fi; fi
 
 RUN mkdir -p /opensearch-benchmark/.benchmark && \
     chown -R 1000:0 /opensearch-benchmark/.benchmark
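For context on the structure above: `FROM build_env_${BUILD_ENV}` selects one of the two named stages at build time, and the `ONBUILD` instruction in the selected stage fires when that stage is used as a base, so a testing build copies in the local checkout while a production build does not (the new CI workflow passes `--build-arg BUILD_ENV=testing` for exactly this purpose). The final `RUN if ...` line then branches the install accordingly; restated as a hedged Python sketch for readability (names are illustrative, the authoritative logic is the Dockerfile itself):

```python
from typing import Optional

# Illustrative restatement of the Dockerfile's install branch; not repository code.
def pip_install_command(build_env: str, version: Optional[str]) -> str:
    if build_env == "testing":
        # Testing image: editable install of the checkout copied in by ONBUILD.
        return "python3 -m pip install -e ."
    if version:  # production image pinned to a released version
        return f"python3 -m pip install opensearch-benchmark=={version}"
    return "python3 -m pip install opensearch-benchmark"  # latest release
```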

docker/Dockerfile-development (-75)

This file was deleted.

docker/docker-compose-tests.yml (-37)

This file was deleted.

osbenchmark/builder/utils/template_renderer.py (+3)

@@ -3,6 +3,7 @@
 
 from osbenchmark.exceptions import InvalidSyntax, SystemSetupError
 from osbenchmark.utils import io
+from osbenchmark.workload import loader
 
 
 class TemplateRenderer:
@@ -11,6 +12,7 @@ def render_template_file(self, root_path, variables, file_name):
 
     def _render_template_file(self, root_path, variables, file_name):
         env = jinja2.Environment(loader=jinja2.FileSystemLoader(root_path), autoescape=select_autoescape(['html', 'xml']))
+        env.filters["version_between"] = loader.version_between
         template = env.get_template(io.basename(file_name))
         # force a new line at the end. Jinja seems to remove it.
         return template.render(variables) + "\n"
@@ -20,6 +22,7 @@ def render_template_string(self, template_string, variables):
 
     def _render_template_string(self, template_string, variables):
         env = jinja2.Environment(loader=jinja2.BaseLoader, autoescape=select_autoescape(['html', 'xml']))
+        env.filters["version_between"] = loader.version_between
         template = env.from_string(template_string)
 
         return template.render(variables)
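With `version_between` registered as a filter, templates rendered by the builder can gate settings on the target version. A minimal sketch of the usage pattern (the filter implementation below is a stand-in that assumes an inclusive range check with a signature like `version_between(version, min_version, max_version)`; the real implementation is `osbenchmark.workload.loader.version_between`, and `distribution_version` is a hypothetical template variable):

```python
import jinja2

# Stand-in for osbenchmark.workload.loader.version_between; assumed semantics:
# True when `version` lies within the inclusive [min_version, max_version] range.
def version_between(version, min_version, max_version):
    def as_tuple(v):
        return tuple(int(part) for part in v.split("."))
    return as_tuple(min_version) <= as_tuple(version) <= as_tuple(max_version)

env = jinja2.Environment()
env.filters["version_between"] = version_between

template = env.from_string(
    "{% if distribution_version | version_between('2.0.0', '2.11.0') %}"
    "compat-settings{% else %}default-settings{% endif %}"
)
print(template.render(distribution_version="2.7.0"))  # -> compat-settings
```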

osbenchmark/metrics.py (+41, -2)

@@ -1440,7 +1440,6 @@ def as_dict(self):
         if self.plugin_params:
             d["plugin-params"] = self.plugin_params
         return d
-
     def to_result_dicts(self):
         """
         :return: a list of dicts, suitable for persisting the results of this test execution in a format that is Kibana-friendly.
@@ -1784,6 +1783,7 @@ def __call__(self):
             op_type = task.operation.type
             error_rate = self.error_rate(t, op_type)
             duration = self.duration(t)
+
             if task.operation.include_in_results_publishing or error_rate > 0:
                 self.logger.debug("Gathering request metrics for [%s].", t)
                 result.add_op_metrics(
@@ -1800,8 +1800,19 @@ def __call__(self):
                         self.workload.meta_data,
                         self.test_procedure.meta_data,
                         task.operation.meta_data,
-                        task.meta_data)
+                        task.meta_data,
+                    ),
+                )
+
+                result.add_correctness_metrics(
+                    t,
+                    task.operation.name,
+                    self.single_latency(t, op_type, metric_name="recall@k"),
+                    self.single_latency(t, op_type, metric_name="recall@1"),
+                    error_rate,
+                    duration,
                 )
+
         self.logger.debug("Gathering indexing metrics.")
         result.total_time = self.sum("indexing_total_time")
         result.total_time_per_shard = self.shard_stats("indexing_total_time")
@@ -1996,6 +2007,7 @@ def single_latency(self, task, operation_type, metric_name="latency"):
 class GlobalStats:
     def __init__(self, d=None):
         self.op_metrics = self.v(d, "op_metrics", default=[])
+        self.correctness_metrics = self.v(d, "correctness_metrics", default=[])
         self.total_time = self.v(d, "total_time")
         self.total_time_per_shard = self.v(d, "total_time_per_shard", default={})
         self.indexing_throttle_time = self.v(d, "indexing_throttle_time")
@@ -2081,6 +2093,22 @@ def op_metrics(op_item, key, single_value=False):
                         "max": item["max"]
                     }
                 })
+            elif metric == "correctness_metrics":
+                for item in value:
+                    if "recall@k" in item:
+                        all_results.append({
+                            "task": item["task"],
+                            "operation": item["operation"],
+                            "name": "recall@k",
+                            "value": item["recall@k"]
+                        })
+                    if "recall@1" in item:
+                        all_results.append({
+                            "task": item["task"],
+                            "operation": item["operation"],
+                            "name": "recall@1",
+                            "value": item["recall@1"]
+                        })
             elif metric.startswith("total_transform_") and value is not None:
                 for item in value:
                     all_results.append({
@@ -2124,6 +2152,17 @@ def add_op_metrics(self, task, operation, throughput, latency, service_time, cli
             doc["meta"] = meta
         self.op_metrics.append(doc)
 
+    def add_correctness_metrics(self, task, operation, recall_at_k_stats, recall_at_1_stats, error_rate, duration):
+        self.correctness_metrics.append({
+            "task": task,
+            "operation": operation,
+            "recall@k": recall_at_k_stats,
+            "recall@1":recall_at_1_stats,
+            "error_rate": error_rate,
+            "duration": duration
+        }
+        )
+
     def tasks(self):
         # ensure we can read test_execution.json files before Benchmark 0.8.0
         return [v.get("task", v["operation"]) for v in self.op_metrics]
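To make the new result shape concrete: each entry appended by `add_correctness_metrics` carries the per-task recall statistics, and the `elif metric == "correctness_metrics"` branch of `to_result_dicts` fans every entry out into one row per metric name. A self-contained sketch of that fan-out with illustrative values (the stats dicts stand in for whatever `single_latency` returns):

```python
# Illustrative data; real entries are built by GlobalStats.add_correctness_metrics.
correctness_metrics = [
    {
        "task": "prod-queries",
        "operation": "knn-query",
        "recall@k": {"mean": 0.97, "min": 0.91, "max": 1.0},
        "recall@1": {"mean": 0.99, "min": 0.95, "max": 1.0},
        "error_rate": 0.0,
        "duration": 412,
    }
]

# Mirrors the correctness_metrics branch in to_result_dicts.
all_results = []
for item in correctness_metrics:
    for name in ("recall@k", "recall@1"):
        if name in item:
            all_results.append({
                "task": item["task"],
                "operation": item["operation"],
                "name": name,
                "value": item[name],
            })

print(all_results)  # two Kibana-friendly rows, one per recall metric
```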
