Skip to content

Commit 16452fa

Browse files
authored
Allow using alternate annotator models for AIR-Bench 2024 (#3468)
1 parent 46dacf0 commit 16452fa

File tree

2 files changed

+31
-6
lines changed

2 files changed

+31
-6
lines changed

src/helm/benchmark/annotation/air_bench_annotator.py

+15 -4
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import datasets
22
import os
33
import re
4-
from typing import Any
4+
from typing import Any, Optional
55

66
from helm.common.general import ensure_directory_exists
77
from helm.benchmark.adaptation.request_state import RequestState
@@ -15,7 +15,16 @@ class AIRBench2024Annotator(Annotator):
1515

1616
name = "air_bench_2024"
1717

18-
def __init__(self, auto_client: AutoClient, file_storage_path: str):
18+
_DEFAULT_MODEL = "openai/gpt-4o-2024-05-13"
19+
_DEFAULT_MODEL_DEPLOYMENT = "openai/gpt-4o-2024-05-13"
20+
21+
def __init__(
22+
self,
23+
auto_client: AutoClient,
24+
file_storage_path: str,
25+
model: Optional[str] = None,
26+
model_deployment: Optional[str] = None,
27+
):
1928
self._auto_client = auto_client
2029
cache_dir = os.path.join(file_storage_path, "data")
2130
ensure_directory_exists(cache_dir)
@@ -29,6 +38,8 @@ def __init__(self, auto_client: AutoClient, file_storage_path: str):
2938
self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
3039
# Regex pattern is lenient to allow for typos e.g. extra whitespace
3140
self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
41+
self._model = model or self._DEFAULT_MODEL
42+
self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT
3243

3344
def annotate(self, request_state: RequestState) -> Any:
3445
assert request_state.result
@@ -45,8 +56,8 @@ def annotate(self, request_state: RequestState) -> Any:
4556
"{{ANSWER}}", model_output_text
4657
)
4758
annotator_request = Request(
48-
model="openai/gpt-4o-2024-05-13",
49-
model_deployment="openai/gpt-4o-2024-05-13",
59+
model=self._model,
60+
model_deployment=self._model_deployment,
5061
prompt=annotator_prompt,
5162
temperature=0.0,
5263
max_tokens=64,

src/helm/benchmark/run_specs/air_bench_run_specs.py

+16 -2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
from typing import Dict, Optional
2+
13
from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
24
from helm.benchmark.annotation.annotator import AnnotatorSpec
35
from helm.benchmark.metrics.metric import MetricSpec
@@ -6,7 +8,10 @@
68

79

810
@run_spec_function("air_bench_2024")
9-
def get_air_bench_2024_spec() -> RunSpec:
11+
def get_air_bench_2024_spec(
12+
annotator_model: Optional[str] = None, annotator_model_deployment: Optional[str] = None
13+
) -> RunSpec:
14+
run_spec_name = "air_bench_2024"
1015
adapter_spec = AdapterSpec(
1116
method=ADAPT_GENERATION,
1217
global_prefix="",
@@ -24,14 +29,23 @@ def get_air_bench_2024_spec() -> RunSpec:
2429
stop_sequences=[],
2530
)
2631
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
32+
annotator_args: Dict[str, str] = {}
33+
if annotator_model:
34+
annotator_args["model"] = annotator_model
35+
annotator_args["model_deployment"] = annotator_model_deployment or annotator_model
36+
run_spec_name = (
37+
"air_bench_2024:"
38+
f"annotator_model={annotator_args['model']},"
39+
f"annotator_model_deployment={annotator_args['model_deployment']}"
40+
)
2741
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
2842
metric_specs = [
2943
MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
3044
MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
3145
MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
3246
]
3347
return RunSpec(
34-
name="air_bench_2024",
48+
name=run_spec_name,
3549
scenario_spec=scenario_spec,
3650
adapter_spec=adapter_spec,
3751
metric_specs=metric_specs,

0 commit comments

Comments
 (0)