Commit c202b15

Author: Yuheng Tu
Commit message: add document
1 parent 1e1ce9f commit c202b15

File tree

3 files changed: +115 -20 lines

docs/reeval.md

+36
@@ -0,0 +1,36 @@
# Reliable and Efficient Amortized Model-based Evaluation

Reliable and Efficient Amortized Model-based Evaluation (Reeval) is an extension of the HELM framework that uses Computerized Adaptive Testing (CAT), grounded in Item Response Theory (IRT), to adaptively evaluate Large Language Models (LLMs). At each step it selects the next question whose difficulty is closest to the current estimate of the model's ability, thereby eliciting that ability reliably and efficiently. The question difficulties are provided on HuggingFace: [`stair-lab/reeval-difficulty-for-helm`](https://huggingface.co/datasets/stair-lab/reeval-difficulty-for-helm), which currently covers 22 scenarios in HELM. The paper's authors will supply a Python package for calculating these difficulties and will support more scenarios in the future.
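
As a rough illustration of how these precomputed difficulties drive adaptive selection, the sketch below loads the dataset with the HuggingFace `datasets` library and picks the question whose difficulty is closest to the current ability estimate. The split name and the `difficulty` column are assumptions for illustration, not a documented schema.

```
# Illustrative sketch only: the split and column names are assumptions.
from datasets import load_dataset

difficulties = load_dataset("stair-lab/reeval-difficulty-for-helm", split="train")


def select_next_item(items, model_ability: float):
    """Return the unasked item whose difficulty is closest to the current ability estimate."""
    return min(items, key=lambda item: abs(item["difficulty"] - model_ability))


next_item = select_next_item(list(difficulties), model_ability=0.0)
```

The runner's actual selection logic operates on HELM `RequestState` objects and reads the difficulty from `instance.extra_data`, as shown in `reeval_runner.py` below.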

# References

[Paper](https://arxiv.org/abs/2503.13335)

# Getting Started

Use Git to clone the `stanford-crfm/helm` repository and set up the environment as a developer, as described in [Developer Setup](https://crfm-helm.readthedocs.io/en/latest/developer_setup/).

The following is an example of adaptively evaluating OpenAI GPT-2 on the MMLU scenario using 50 instances. The argument `--model-ability` sets the initial ability of the model for the reeval evaluation. The argument `--max-samples` sets the maximum number of samples to evaluate in reeval mode. The argument `--metric-name` specifies the main metric name for the scenario. Note that reeval mode does not support the argument `--max-eval-instances`, because it could contradict `--max-samples`. All other arguments are the same as in standard HELM.
```
# Run benchmark
cd src
export SUITE_NAME=reeval_mmlu_openai_gpt2
export MODELS_TO_RUN=openai/gpt2
export RUN_ENTRIES_CONF_PATH=helm/benchmark/presentation/run_entries_mmlu.conf
export SCHEMA_PATH=schema_mmlu.yaml
export NUM_TRAIN_TRIALS=1
export PRIORITY=4
export MODEL_ABILITY=0.0
export MAX_SAMPLES=50
export METRIC_NAME=exact_match
python3 -m helm.benchmark.reeval_run --conf-paths $RUN_ENTRIES_CONF_PATH --num-train-trials $NUM_TRAIN_TRIALS --priority $PRIORITY --suite $SUITE_NAME --models-to-run $MODELS_TO_RUN --runner-class-name helm.benchmark.reeval_runner.RelEffEvalRunner --model-ability $MODEL_ABILITY --max-samples $MAX_SAMPLES --metric-name $METRIC_NAME

# Summarize benchmark results
helm-summarize --schema $SCHEMA_PATH --suite $SUITE_NAME

# Start a web server to display benchmark results
helm-server --suite $SUITE_NAME
```

Then go to http://localhost:8000/ in your browser.

src/helm/benchmark/reeval_run.py

+70 -11
```
@@ -21,7 +21,6 @@
 from helm.common.reeval_parameters import ReevalParameters
 from helm.benchmark.run import (
     run_benchmarking,
-    add_run_args,
     validate_args,
 )

@@ -33,6 +32,9 @@ def run_entries_to_run_specs(
     models_to_run: Optional[List[str]] = None,
     groups_to_run: Optional[List[str]] = None,
     priority: Optional[int] = None,
+    model_ability: Optional[float] = None,
+    max_samples: Optional[int] = None,
+    metric_name: Optional[str] = None,
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
     run_specs: List[RunSpec] = []
@@ -63,7 +65,11 @@
         # Add reeval_parameters
         adapter_spec = replace(
             adapter_spec,
-            reeval_parameters=ReevalParameters(model_ability=0.0, max_samples=50, metric_name="exact_match"),
+            reeval_parameters=ReevalParameters(
+                model_ability=model_ability,
+                max_samples=max_samples,
+                metric_name=metric_name,
+            ),
         )

         run_spec = replace(run_spec, adapter_spec=adapter_spec)
@@ -78,6 +84,64 @@
     return run_specs


+def add_run_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "-o", "--output-path", type=str, help="Where to save all the output", default="benchmark_output"
+    )
+    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads to make requests", default=4)
+    parser.add_argument(
+        "--skip-instances",
+        action="store_true",
+        help="Skip creation of instances (basically do nothing but just parse everything).",
+    )
+    parser.add_argument(
+        "--cache-instances",
+        action="store_true",
+        help="Save generated instances input to model to disk. If already cached, read instances from file.",
+    )
+    parser.add_argument(
+        "--cache-instances-only",
+        action="store_true",
+        help="Generate and save instances for scenario ONLY (i.e. do not evaluate models on instances).",
+    )
+    parser.add_argument(
+        "-d",
+        "--dry-run",
+        action="store_true",
+        help="Skip execution, only output scenario states and estimate token usage.",
+    )
+    parser.add_argument(
+        "-t",
+        "--num-train-trials",
+        type=int,
+        help="Number of trials where each trial samples a different set of in-context examples. "
+        "Overrides the value in Adapter spec.",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite this run belongs to (default is today's date).",
+        required=True,
+    )
+    parser.add_argument(
+        "--local-path",
+        type=str,
+        help="If running locally, the path for `ServerService`.",
+        default="prod_env",
+    )
+    parser.add_argument(
+        "--mongo-uri",
+        type=str,
+        help="If non-empty, the URL of the MongoDB database that will be used for caching instead of SQLite",
+        default="",
+    )
+    parser.add_argument(
+        "--disable-cache",
+        action="store_true",
+        help="If true, the request-response cache for model clients and tokenizers will be disabled.",
+    )
+
+
 @htrack(None)
 def main():
     parser = argparse.ArgumentParser()
@@ -150,7 +214,7 @@ def main():
         "--model-ability",
         type=float,
         default=0.0,
-        help="The inital ability of the model for reeval evaluation.",
+        help="The initial ability of the model for reeval evaluation.",
     )
     parser.add_argument(
         "--max-samples",
@@ -167,13 +231,6 @@
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
-
-    if args.max_eval_instances:
-        hlog(
-            "WARNING: In reeval mode, max-eval-instances will not be used to downsample the evaluation instances. "
-            "Use --max-samples"
-        )
-
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)

@@ -222,11 +279,13 @@

     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
-        max_eval_instances=args.max_eval_instances,
         num_train_trials=args.num_train_trials,
         models_to_run=args.models_to_run,
         groups_to_run=args.groups_to_run,
         priority=args.priority,
+        model_ability=args.model_ability,
+        max_samples=args.max_samples,
+        metric_name=args.metric_name,
     )
     hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")

```
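
The `ReevalParameters` class imported from `helm.common.reeval_parameters` is not part of this diff. Judging from the fields used above, a minimal sketch of it might look like the following; the actual definition in HELM may differ.

```
# Illustrative sketch only; the real dataclass lives in helm/common/reeval_parameters.py.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class ReevalParameters:
    """Parameters controlling reeval (adaptive) evaluation."""

    # Initial ability estimate of the model on the IRT ability scale.
    model_ability: Optional[float] = None

    # Maximum number of adaptively selected instances to evaluate.
    max_samples: Optional[int] = None

    # Main metric used as the correctness signal, e.g. "exact_match".
    metric_name: Optional[str] = None
```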

src/helm/benchmark/reeval_runner.py

+9 -9
```
@@ -48,8 +48,8 @@ class RelEffEvalRunner(Runner):
     the authors of the paper will supply a Python package for calculating these difficulties.
     At each iteration, the runner estimates the model's ability based on all previously
     administered questions and their corresponding responses. It then selects the next question
-    whose difficulty is closest to the estimated ability, thereby optimally eliciting the
-    model's ability.
+    whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+    eliciting the model's ability.
     """

     def __init__(
@@ -184,12 +184,12 @@ def run_one(self, run_spec: RunSpec):
         # model_ability = run_spec.adapter_spec.reeval_parameters.model_ability
         # scenario_metric_name = run_spec.adapter_spec.reeval_parameters.metric_name
         # max_samples = run_spec.adapter_spec.reeval_parameters.max_samples
-        if run_spec.adapter_spec.reeval_parameters:
-            if run_spec.adapter_spec.reeval_parameters.model_ability:
+        if run_spec.adapter_spec.reeval_parameters is not None:
+            if run_spec.adapter_spec.reeval_parameters.model_ability is not None:
                 model_ability = run_spec.adapter_spec.reeval_parameters.model_ability
-            if run_spec.adapter_spec.reeval_parameters.metric_name:
+            if run_spec.adapter_spec.reeval_parameters.metric_name is not None:
                 scenario_metric_name = run_spec.adapter_spec.reeval_parameters.metric_name
-            if run_spec.adapter_spec.reeval_parameters.max_samples:
+            if run_spec.adapter_spec.reeval_parameters.max_samples is not None:
                 max_samples = run_spec.adapter_spec.reeval_parameters.max_samples

         asked_request_states: List[RequestState] = []
@@ -213,7 +213,7 @@ def run_one(self, run_spec: RunSpec):
             selected_item = None
             min_diff = float("inf")
             for item in unasked_request_states:
-                assert item.instance.extra_data
+                assert item.instance.extra_data is not None
                 diff = abs(item.instance.extra_data["difficulty"] - model_ability)
                 if diff < min_diff:
                     min_diff = diff
@@ -276,9 +276,9 @@ def run_one(self, run_spec: RunSpec):
             ].mean

             # TODO: look for better way to fix the type-checker error
-            assert scenario_metric_value
+            assert scenario_metric_value is not None
             reeval_trajectory["response_correctness"].append(scenario_metric_value)
-            assert selected_item.instance.extra_data
+            assert selected_item.instance.extra_data is not None
             reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])

             # Estimate the model ability
```
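
The `# Estimate the model ability` step that follows this hunk is not shown in the diff. As a minimal sketch of what such an update could look like, assuming a Rasch (1PL) response model and a plain Newton-Raphson maximum-likelihood update rather than the runner's actual estimator:

```
# Illustrative sketch only, assuming a Rasch (1PL) model:
# P(correct) = sigmoid(ability - difficulty).
import math
from typing import List


def estimate_ability(correctness: List[float], difficulties: List[float],
                     ability: float = 0.0, num_iters: int = 20) -> float:
    """Newton-Raphson maximum-likelihood estimate of the model ability."""
    for _ in range(num_iters):
        grad, hess = 0.0, 0.0
        for y, b in zip(correctness, difficulties):
            p = 1.0 / (1.0 + math.exp(-(ability - b)))
            grad += y - p          # first derivative of the log-likelihood
            hess -= p * (1.0 - p)  # second derivative (always non-positive)
        if hess == 0.0:            # no administered questions yet
            break
        ability -= grad / hess     # Newton step
    return ability


# Example: two correct answers on easier questions, one miss on a harder one.
print(estimate_ability([1.0, 1.0, 0.0], [-0.5, 0.0, 1.5]))
```

In this sketch the correctness values correspond to `reeval_trajectory["response_correctness"]` and the difficulties to `reeval_trajectory["instance_difficulties"]` collected above.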
