Commit c202b15

Author: Yuheng Tu
Commit message: add document
1 parent 1e1ce9f commit c202b15

File tree

3 files changed: +115 -20 lines

docs/reeval.md

+36
@@ -0,0 +1,36 @@
# Reliable and Efficient Amortized Model-based Evaluation

Reliable and Efficient Amortized Model-based Evaluation (Reeval) is an extension of the HELM framework that uses Computerized Adaptive Testing (CAT), grounded in Item Response Theory (IRT), to adaptively evaluate Large Language Models (LLMs). At each step it selects the next question whose difficulty is closest to the current estimate of the model's ability, thereby eliciting that ability reliably and efficiently. The question difficulties are provided on HuggingFace: [`stair-lab/reeval-difficulty-for-helm`](https://huggingface.co/datasets/stair-lab/reeval-difficulty-for-helm), which currently covers 22 scenarios in HELM. The paper's authors will supply a Python package for calculating these difficulties and will support more scenarios in the future.
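
As a rough illustration of how these precomputed difficulties drive adaptive selection, the sketch below loads the dataset with the HuggingFace `datasets` library and picks the question whose difficulty is closest to the current ability estimate. The split name and the `difficulty` column are assumptions for illustration, not a documented schema.

```
# Illustrative sketch only: the split and column names are assumptions.
from datasets import load_dataset

difficulties = load_dataset("stair-lab/reeval-difficulty-for-helm", split="train")


def select_next_item(items, model_ability: float):
    """Return the unasked item whose difficulty is closest to the current ability estimate."""
    return min(items, key=lambda item: abs(item["difficulty"] - model_ability))


next_item = select_next_item(list(difficulties), model_ability=0.0)
```

The runner's actual selection logic operates on HELM `RequestState` objects and reads the difficulty from `instance.extra_data`, as shown in `reeval_runner.py` below.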

# References

[Paper](https://arxiv.org/abs/2503.13335)

# Getting Started

Use Git to clone the `stanford-crfm/helm` repository and set up the environment as a developer, as described in [Developer Setup](https://crfm-helm.readthedocs.io/en/latest/developer_setup/).

The following is an example of adaptively evaluating OpenAI GPT-2 on the MMLU scenario using 50 instances. The argument `--model-ability` sets the initial ability of the model for the reeval evaluation. The argument `--max-samples` sets the maximum number of samples to evaluate in reeval mode. The argument `--metric-name` specifies the main metric name for the scenario. Note that reeval mode does not support the argument `--max-eval-instances`, because it could contradict `--max-samples`. All other arguments are the same as in standard HELM.
```
# Run benchmark
cd src
export SUITE_NAME=reeval_mmlu_openai_gpt2
export MODELS_TO_RUN=openai/gpt2
export RUN_ENTRIES_CONF_PATH=helm/benchmark/presentation/run_entries_mmlu.conf
export SCHEMA_PATH=schema_mmlu.yaml
export NUM_TRAIN_TRIALS=1
export PRIORITY=4
export MODEL_ABILITY=0.0
export MAX_SAMPLES=50
export METRIC_NAME=exact_match
python3 -m helm.benchmark.reeval_run --conf-paths $RUN_ENTRIES_CONF_PATH --num-train-trials $NUM_TRAIN_TRIALS --priority $PRIORITY --suite $SUITE_NAME --models-to-run $MODELS_TO_RUN --runner-class-name helm.benchmark.reeval_runner.RelEffEvalRunner --model-ability $MODEL_ABILITY --max-samples $MAX_SAMPLES --metric-name $METRIC_NAME

# Summarize benchmark results
helm-summarize --schema $SCHEMA_PATH --suite $SUITE_NAME

# Start a web server to display benchmark results
helm-server --suite $SUITE_NAME
```

Then go to http://localhost:8000/ in your browser.

src/helm/benchmark/reeval_run.py

+70 -11
```
@@ -21,7 +21,6 @@
 from helm.common.reeval_parameters import ReevalParameters
 from helm.benchmark.run import (
     run_benchmarking,
-    add_run_args,
     validate_args,
 )

@@ -33,6 +32,9 @@ def run_entries_to_run_specs(
     models_to_run: Optional[List[str]] = None,
     groups_to_run: Optional[List[str]] = None,
     priority: Optional[int] = None,
+    model_ability: Optional[float] = None,
+    max_samples: Optional[int] = None,
+    metric_name: Optional[str] = None,
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
     run_specs: List[RunSpec] = []
@@ -63,7 +65,11 @@
         # Add reeval_parameters
         adapter_spec = replace(
             adapter_spec,
-            reeval_parameters=ReevalParameters(model_ability=0.0, max_samples=50, metric_name="exact_match"),
+            reeval_parameters=ReevalParameters(
+                model_ability=model_ability,
+                max_samples=max_samples,
+                metric_name=metric_name,
+            ),
         )

         run_spec = replace(run_spec, adapter_spec=adapter_spec)
@@ -78,6 +84,64 @@
     return run_specs


+def add_run_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "-o", "--output-path", type=str, help="Where to save all the output", default="benchmark_output"
+    )
+    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads to make requests", default=4)
+    parser.add_argument(
+        "--skip-instances",
+        action="store_true",
+        help="Skip creation of instances (basically do nothing but just parse everything).",
+    )
+    parser.add_argument(
+        "--cache-instances",
+        action="store_true",
+        help="Save generated instances input to model to disk. If already cached, read instances from file.",
+    )
+    parser.add_argument(
+        "--cache-instances-only",
+        action="store_true",
+        help="Generate and save instances for scenario ONLY (i.e. do not evaluate models on instances).",
+    )
+    parser.add_argument(
+        "-d",
+        "--dry-run",
+        action="store_true",
+        help="Skip execution, only output scenario states and estimate token usage.",
+    )
+    parser.add_argument(
+        "-t",
+        "--num-train-trials",
+        type=int,
+        help="Number of trials where each trial samples a different set of in-context examples. "
+        "Overrides the value in Adapter spec.",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite this run belongs to (default is today's date).",
+        required=True,
+    )
+    parser.add_argument(
+        "--local-path",
+        type=str,
+        help="If running locally, the path for `ServerService`.",
+        default="prod_env",
+    )
+    parser.add_argument(
+        "--mongo-uri",
+        type=str,
+        help="If non-empty, the URL of the MongoDB database that will be used for caching instead of SQLite",
+        default="",
+    )
+    parser.add_argument(
+        "--disable-cache",
+        action="store_true",
+        help="If true, the request-response cache for model clients and tokenizers will be disabled.",
+    )
+
+
 @htrack(None)
 def main():
     parser = argparse.ArgumentParser()
@@ -150,7 +214,7 @@ def main():
         "--model-ability",
         type=float,
         default=0.0,
-        help="The inital ability of the model for reeval evaluation.",
+        help="The initial ability of the model for reeval evaluation.",
     )
     parser.add_argument(
         "--max-samples",
@@ -167,13 +231,6 @@
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
-
-    if args.max_eval_instances:
-        hlog(
-            "WARNING: In reeval mode, max-eval-instances will not be used to downsample the evaluation instances. "
-            "Use --max-samples"
-        )
-
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)

@@ -222,11 +279,13 @@

     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
-        max_eval_instances=args.max_eval_instances,
         num_train_trials=args.num_train_trials,
         models_to_run=args.models_to_run,
         groups_to_run=args.groups_to_run,
         priority=args.priority,
+        model_ability=args.model_ability,
+        max_samples=args.max_samples,
+        metric_name=args.metric_name,
     )
     hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")

```
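
The `ReevalParameters` class imported from `helm.common.reeval_parameters` is not part of this diff. Judging from the fields used above, a minimal sketch of it might look like the following; the actual definition in HELM may differ.

```
# Illustrative sketch only; the real dataclass lives in helm/common/reeval_parameters.py.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class ReevalParameters:
    """Parameters controlling reeval (adaptive) evaluation."""

    # Initial ability estimate of the model on the IRT ability scale.
    model_ability: Optional[float] = None

    # Maximum number of adaptively selected instances to evaluate.
    max_samples: Optional[int] = None

    # Main metric used as the correctness signal, e.g. "exact_match".
    metric_name: Optional[str] = None
```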

src/helm/benchmark/reeval_runner.py

+9 -9
```
@@ -48,8 +48,8 @@ class RelEffEvalRunner(Runner):
     the authors of the paper will supply a Python package for calculating these difficulties.
     At each iteration, the runner estimates the model's ability based on all previously
     administered questions and their corresponding responses. It then selects the next question
-    whose difficulty is closest to the estimated ability, thereby optimally eliciting the
-    model's ability.
+    whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+    eliciting the model's ability.
     """

     def __init__(
@@ -184,12 +184,12 @@ def run_one(self, run_spec: RunSpec):
         # model_ability = run_spec.adapter_spec.reeval_parameters.model_ability
         # scenario_metric_name = run_spec.adapter_spec.reeval_parameters.metric_name
         # max_samples = run_spec.adapter_spec.reeval_parameters.max_samples
-        if run_spec.adapter_spec.reeval_parameters:
-            if run_spec.adapter_spec.reeval_parameters.model_ability:
+        if run_spec.adapter_spec.reeval_parameters is not None:
+            if run_spec.adapter_spec.reeval_parameters.model_ability is not None:
                 model_ability = run_spec.adapter_spec.reeval_parameters.model_ability
-            if run_spec.adapter_spec.reeval_parameters.metric_name:
+            if run_spec.adapter_spec.reeval_parameters.metric_name is not None:
                 scenario_metric_name = run_spec.adapter_spec.reeval_parameters.metric_name
-            if run_spec.adapter_spec.reeval_parameters.max_samples:
+            if run_spec.adapter_spec.reeval_parameters.max_samples is not None:
                 max_samples = run_spec.adapter_spec.reeval_parameters.max_samples

         asked_request_states: List[RequestState] = []
@@ -213,7 +213,7 @@ def run_one(self, run_spec: RunSpec):
             selected_item = None
             min_diff = float("inf")
             for item in unasked_request_states:
-                assert item.instance.extra_data
+                assert item.instance.extra_data is not None
                 diff = abs(item.instance.extra_data["difficulty"] - model_ability)
                 if diff < min_diff:
                     min_diff = diff
@@ -276,9 +276,9 @@ def run_one(self, run_spec: RunSpec):
             ].mean

             # TODO: look for better way to fix the type-checker error
-            assert scenario_metric_value
+            assert scenario_metric_value is not None
             reeval_trajectory["response_correctness"].append(scenario_metric_value)
-            assert selected_item.instance.extra_data
+            assert selected_item.instance.extra_data is not None
             reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])

             # Estimate the model ability
```
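
The `# Estimate the model ability` step that follows this hunk is not shown in the diff. As a minimal sketch of what such an update could look like, assuming a Rasch (1PL) response model and a plain Newton-Raphson maximum-likelihood update rather than the runner's actual estimator:

```
# Illustrative sketch only, assuming a Rasch (1PL) model:
# P(correct) = sigmoid(ability - difficulty).
import math
from typing import List


def estimate_ability(correctness: List[float], difficulties: List[float],
                     ability: float = 0.0, num_iters: int = 20) -> float:
    """Newton-Raphson maximum-likelihood estimate of the model ability."""
    for _ in range(num_iters):
        grad, hess = 0.0, 0.0
        for y, b in zip(correctness, difficulties):
            p = 1.0 / (1.0 + math.exp(-(ability - b)))
            grad += y - p          # first derivative of the log-likelihood
            hess -= p * (1.0 - p)  # second derivative (always non-positive)
        if hess == 0.0:            # no administered questions yet
            break
        ability -= grad / hess     # Newton step
    return ability


# Example: two correct answers on easier questions, one miss on a harder one.
print(estimate_ability([1.0, 1.0, 0.0], [-0.5, 0.0, 1.5]))
```

In this sketch the correctness values correspond to `reeval_trajectory["response_correctness"]` and the difficulties to `reeval_trajectory["instance_difficulties"]` collected above.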
