NoushNabi
diff --git a/‎helm-frontend/src/components/AnnotationsDisplay.tsx
+51-22 b/‎helm-frontend/src/components/AnnotationsDisplay.tsx
+51-22
diff --git a/‎helm-frontend/src/utils/getReleaseUrl.ts
+3 b/‎helm-frontend/src/utils/getReleaseUrl.ts
+3
diff --git a/‎requirements.txt
+1 b/‎requirements.txt
+1
diff --git a/‎scripts/air_bench/export_air_bench_csv.py
+109 b/‎scripts/air_bench/export_air_bench_csv.py
+109
diff --git a/‎setup.cfg
+7 b/‎setup.cfg
+7
diff --git a/‎src/helm/benchmark/annotation/air_bench_annotator.py
+64 b/‎src/helm/benchmark/annotation/air_bench_annotator.py
+64
diff --git a/‎src/helm/benchmark/annotation/annotator_factory.py
+6 b/‎src/helm/benchmark/annotation/annotator_factory.py
+6
diff --git a/‎src/helm/benchmark/metrics/fin_qa_metrics.py
+60 b/‎src/helm/benchmark/metrics/fin_qa_metrics.py
+60
@@ -2,12 +2,59 @@ import CompletionAnnotation from "@/types/CompletionAnnotation";
 import Preview from "./Preview";
 import MediaObjectDisplay from "./MediaObjectDisplay";
 
+// TODO: This is a dirty hack to support annotations from
+// Image2Structure and AIRBench, but eventually we should make sure
+// all annotations are supported generally.
 type Props = {
   predictionAnnotations:
-    | Record<string, Array<CompletionAnnotation>>
+    | Record<
+        string,
+        Array<CompletionAnnotation> | Record<string, string | number>
+      >
     | undefined;
 };
 
+function listAnnotationDisplay(listAnnotation: Array<CompletionAnnotation>) {
+  return (
+    <div>
+      {listAnnotation.map((annotation, idx) => (
+        <div key={idx}>
+          {annotation.error && (
+            <div>
+              <h3 className="ml-1">Error</h3>
+              <Preview value={annotation["error"]} />{" "}
+            </div>
+          )}
+          {annotation.text && (
+            <div>
+              <h3 className="ml-1">Text</h3>
+              <Preview value={annotation["text"]} />{" "}
+            </div>
+          )}
+          {annotation.media_object && (
+            <MediaObjectDisplay mediaObject={annotation["media_object"]} />
+          )}
+        </div>
+      ))}
+    </div>
+  );
+}
+
+function dictAnnotationDisplay(
+  dictAnnotation: Record<string, string | number>,
+) {
+  return (
+    <div>
+      {Object.entries(dictAnnotation).map(([key, value]) => (
+        <div>
+          <h3 className="ml-1">{key}</h3>
+          <Preview value={value.toString()} />
+        </div>
+      ))}
+    </div>
+  );
+}
+
 export default function AnnotationDisplay({ predictionAnnotations }: Props) {
   return (
     <div>
@@ -17,27 +64,9 @@ export default function AnnotationDisplay({ predictionAnnotations }: Props) {
               <h3>
                 <strong>{key}</strong>
               </h3>
-              {value.map((annotation, idx) => (
-                <div key={idx}>
-                  {annotation.error && (
-                    <div>
-                      <h3 className="ml-1">Error</h3>
-                      <Preview value={annotation["error"]} />{" "}
-                    </div>
-                  )}
-                  {annotation.text && (
-                    <div>
-                      <h3 className="ml-1">Text</h3>
-                      <Preview value={annotation["text"]} />{" "}
-                    </div>
-                  )}
-                  {annotation.media_object && (
-                    <MediaObjectDisplay
-                      mediaObject={annotation["media_object"]}
-                    />
-                  )}
-                </div>
-              ))}
+              {Array.isArray(value)
+                ? listAnnotationDisplay(value)
+                : dictAnnotationDisplay(value)}
             </div>
           ))
         : null}
 
@@ -5,6 +5,9 @@ export default function getReleaseUrl(
   if (!currProjectId) {
     return "#";
   }
+  if (currProjectId === "home") {
+    return `https://crfm.stanford.edu/helm/`;
+  }
   if (!version) {
     return `https://crfm.stanford.edu/helm/${currProjectId}/latest/`;
   }
 
@@ -238,6 +238,7 @@ PyWavelets==1.4.1
 PyYAML==6.0.1
 referencing==0.35.1
 regex==2024.5.10
+reka-api==2.0.0
 requests==2.31.0
 requests-oauthlib==2.0.0
 retrying==1.3.4
 
@@ -0,0 +1,109 @@
+"""Reads all runs from the suite and writes them to the CSV folder in CSV format.
+
+EXPERIMENTAL: Not for public use.
+TEMPORARY: Delete after 2024-09-30"""
+
+import argparse
+import csv
+import os
+import re
+
+from tqdm import tqdm
+
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.common.codec import from_json
+from helm.common.general import ensure_directory_exists
+
+
+class FieldNames:
+    CATEGORY_ID = "cate-idx"
+    L2_NAME = "l2-name"
+    L3_NAME = "l3-name"
+    L4_NAME = "l4-name"
+    PROMPT = "prompt"
+    RESPONSE = "response"
+    JUDGE_PROMPT = "judge_prompt"
+    SCORE_REASON = "score_reason"
+    SCORE = "score"
+
+
+def process_one(scenario_state_path: str, csv_file_path: str):
+    with open(scenario_state_path) as f:
+        scenario_state = from_json(f.read(), ScenarioState)
+
+    fieldnames = [
+        FieldNames.CATEGORY_ID,
+        FieldNames.L2_NAME,
+        FieldNames.L3_NAME,
+        FieldNames.L4_NAME,
+        FieldNames.PROMPT,
+        FieldNames.RESPONSE,
+        FieldNames.JUDGE_PROMPT,
+        FieldNames.SCORE_REASON,
+        FieldNames.SCORE,
+    ]
+    with open(csv_file_path, "w", newline="") as output_file:
+        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
+        writer.writeheader()
+        for request_state in scenario_state.request_states:
+            row = {}
+            references = request_state.instance.references
+            assert len(references) == 4
+            row[FieldNames.CATEGORY_ID] = references[0].output.text
+            row[FieldNames.L2_NAME] = references[1].output.text
+            row[FieldNames.L3_NAME] = references[2].output.text
+            row[FieldNames.L4_NAME] = references[3].output.text
+            row[FieldNames.PROMPT] = request_state.request.prompt
+            assert request_state.result
+            assert len(request_state.result.completions) == 1
+            row[FieldNames.RESPONSE] = request_state.result.completions[0].text
+            assert request_state.annotations
+            row[FieldNames.JUDGE_PROMPT] = request_state.annotations["air_bench_2024"]["prompt_text"]
+            row[FieldNames.SCORE_REASON] = request_state.annotations["air_bench_2024"]["reasoning"]
+            row[FieldNames.SCORE] = request_state.annotations["air_bench_2024"]["score"]
+            writer.writerow(row)
+    print(f"Wrote {csv_file_path}")
+
+
+def process_all(suite_path: str, csv_path: str):
+    ensure_directory_exists(csv_path)
+    run_dir_names = sorted([p for p in os.listdir(suite_path) if p.startswith("air_bench_2024:")])
+    for run_dir_name in tqdm(run_dir_names, disable=None):
+        scenario_state_path = os.path.join(suite_path, run_dir_name, "scenario_state.json")
+        if not os.path.isfile(scenario_state_path):
+            continue
+        model_name_match = re.search("model=([A-Za-z0-9_-]+)", run_dir_name)
+        assert model_name_match
+        model_name = model_name_match[1]
+        csv_file_path = os.path.join(csv_path, f"{model_name}_result.csv")
+        process_one(scenario_state_path, csv_file_path)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--csv-path",
+        type=str,
+        help="Name of the CSV folder.",
+        default="csv_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite.",
+        required=True,
+    )
+    args = parser.parse_args()
+    suite_path = os.path.join(args.output_path, "runs", args.suite)
+    process_all(suite_path, args.csv_path)
+
+
+if __name__ == "__main__":
+    main()
@@ -161,11 +161,15 @@ models =
     crfm-helm[google]
     crfm-helm[mistral]
     crfm-helm[openai]
+    crfm-helm[reka]
     crfm-helm[together]
     crfm-helm[tsinghua]
     crfm-helm[yandex]
     crfm-helm[openvino]
 
+reka = 
+    reka-api~=2.0.0
+
 vlm =
     crfm-helm[openai]
 
@@ -182,6 +186,9 @@ vlm =
     scipy~=1.10
     torchvision>=0.14.1,<3.0.0
 
+    # For Reka AI
+    crfm-helm[reka]
+    
     # VLM scenarios
     crfm-helm[images]
     crfm-helm[image2structure]
 
@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+
+    name = "air_bench_2024"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "yifanmai/air-bench-2024-beta", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional
 
+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ def get_annotator(self, annotator_spec: AnnotatorSpec) -> Annotator:
             provider_bindings={
                 "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                 "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                "auto_client": lambda: AutoClient(
+                    credentials=self.credentials,
+                    file_storage_path=self.file_storage_path,
+                    cache_backend_config=self.cache_backend_config,
+                ),
             },
         )
         annotator = create_object(annotator_spec)
 
@@ -0,0 +1,60 @@
+import math
+import json
+from typing import List, Union
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.fin_qa_metrics_helper import (  # type: ignore
+    equal_program,
+    eval_program,
+    program_tokenization,
+)
+
+
+def _get_program_accuracy(reference_program: List[str], generated_program: List[str]) -> float:
+    return 1.0 if equal_program(reference_program, generated_program) else 0.0
+
+
+def _get_execution_accuracy(reference_execution: str, generated_program: List[str], table: List[List[str]]) -> float:
+    invalid_flag: int
+    generated_result: Union[str, float]
+    invalid_flag, generated_result = eval_program(generated_program, table)
+    if invalid_flag:
+        return 0.0
+    if reference_execution == "yes" or reference_execution == "no":
+        return 1.0 if reference_execution == generated_result else 0
+    else:
+        if not isinstance(generated_result, float):
+            return 0.0
+        return 1.0 if math.isclose(float(reference_execution), generated_result) else 0
+
+
+class FinQAMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) == 3
+        reference_text = request_state.instance.references[0].output.text
+        reference_program = program_tokenization(reference_text)
+        reference_execution = request_state.instance.references[1].output.text
+        table: List[List[str]] = json.loads(request_state.instance.references[2].output.text)
+
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        generated_text = request_state.result.completions[0].text.strip()
+        generated_program = program_tokenization(generated_text)
+
+        return [
+            Stat(MetricName("program_accuracy")).add(_get_program_accuracy(reference_program, generated_program)),
+            Stat(MetricName("execution_accuracy")).add(
+                _get_execution_accuracy(reference_execution, generated_program, table)
+            ),
+        ]
Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,9 @@ export default function getReleaseUrl(`
`5`	`5`	`if (!currProjectId) {`
`6`	`6`	`return "#";`
`7`	`7`	`}`
	`8`	`+ if (currProjectId === "home") {`
	`9`	+ return `https://crfm.stanford.edu/helm/`;
	`10`	`+ }`
`8`	`11`	`if (!version) {`
`9`	`12`	return `https://crfm.stanford.edu/helm/${currProjectId}/latest/`;
`10`	`13`	`}`