Skip to content

Commit 185bc24

Browse files
authored
Add InfiniteBenchSum scenario and run spec (#3409)
1 parent a6dcc3d commit 185bc24

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

src/helm/benchmark/run_specs/experimental_run_specs.py

+31
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,34 @@ def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -
192192
annotators=[AnnotatorSpec("helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator")],
193193
groups=["czech_bank_qa"],
194194
)
195+
196+
197+
@run_spec_function("infinite_bench_sum")
def get_infinite_bench_sum_spec(min_num_word: float = 0.0, max_num_word: float = 1e7) -> RunSpec:
    """Build the RunSpec for InfiniteBench "En.Sum" (novel summarization).

    min_num_word / max_num_word bound the prompt word count of the instances
    the scenario keeps; the defaults keep everything.
    """
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.infinite_bench_sum_scenario.InfiniteBenchSumScenario",
        args={"min_num_word": min_num_word, "max_num_word": max_num_word},
    )

    adapter_spec = AdapterSpec(
        method=ADAPT_GENERATION,
        input_prefix="",
        output_prefix="",
        # No official number; the paper reports ~1.1k output tokens on average.
        max_tokens=2000,
        num_outputs=1,
        temperature=0.0,
    )

    return RunSpec(
        name="infinite_bench_sum",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_basic_metric_specs(["rouge_l"]),
        groups=["infinite_bench_sum"],
    )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import os
2+
import re
3+
from typing import List
4+
from datasets import load_dataset, Features, Value, Sequence, Dataset
5+
from helm.benchmark.scenarios.scenario import (
6+
Scenario,
7+
Instance,
8+
Input,
9+
Reference,
10+
Output,
11+
CORRECT_TAG,
12+
TEST_SPLIT,
13+
)
14+
from helm.common.general import ensure_directory_exists
15+
16+
17+
class InfiniteBenchSumScenario(Scenario):
    """InfiniteBench Sum

    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
    InfiniteBench that requires models to generate a concise summary of the novel. The subset is referred
    to as "En.Sum" in the original paper.
    """

    name = "infinite_bench_sum"
    description = "Summarize a novel from InfiniteBench"
    tags = ["summarization"]

    def __init__(self, min_num_word: float = 0.0, max_num_word: float = 100e6):
        """Keep only examples whose prompt word count is in [min_num_word, max_num_word]."""
        self.min_num_word = min_num_word
        self.max_num_word = max_num_word
        super().__init__()

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download InfiniteBench from HuggingFace into this scenario's data directory.
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)

        # Explicit schema for the summarization split, so loading does not depend
        # on datasets' type inference.
        features = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )

        dataset = load_dataset(
            "xinrongzhang2022/InfiniteBench",
            split="longbook_sum_eng",
            features=features,
            # Pin the dataset revision for reproducibility.
            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
            # Fix: cache_dir was created but never used, so downloads went to the
            # global HF cache instead of the scenario's data directory.
            cache_dir=cache_dir,
        )

        assert isinstance(dataset, Dataset)

        def count_words(text: str) -> int:
            """Whitespace-delimited word count (an empty string counts as 1)."""
            return len(re.split(r"\s+", text.strip()))

        # Annotate each example with its prompt word count, then keep only the
        # examples whose count falls in the configured window.
        dataset = dataset.map(
            lambda example: {"prompt_wc": count_words(example["context"]) + count_words(example["input"])}
        ).filter(lambda example: self.min_num_word <= example["prompt_wc"] <= self.max_num_word)

        # Build one Instance per remaining row.
        instances: List[Instance] = []
        for row in dataset:
            # Renamed locals so the builtins `id` and `input` are not shadowed.
            instance_input = Input(text=row["context"] + "\n\n" + row["input"])
            instance = Instance(
                # NOTE(review): row["id"] is an int64 — assumes Instance accepts a
                # non-str id; confirm against the Instance definition.
                id=row["id"],
                input=instance_input,
                # Only the first reference answer is used as the gold summary.
                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
                extra_data={"word_count": row["prompt_wc"]},
            )
            instances.append(instance)

        return instances
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import pytest
2+
import re
3+
from tempfile import TemporaryDirectory
4+
from helm.benchmark.scenarios.infinite_bench_sum_scenario import InfiniteBenchSumScenario
5+
from helm.benchmark.scenarios.scenario import CORRECT_TAG
6+
7+
8+
def count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text*.

    Note: an empty (or all-whitespace) string yields 1, because ``re.split``
    on an empty string returns ``[""]``.
    """
    tokens = re.split(r"\s+", text.strip())
    return len(tokens)
10+
11+
12+
@pytest.mark.scenarios
def test_infinite_bench_sum_scenario():
    # End-to-end test: downloads the real dataset from HuggingFace, so the
    # expected counts/lengths below are pinned to the revision fixed inside the
    # scenario. If the revision changes, these constants must be re-derived.
    with TemporaryDirectory() as tmpdir:
        # No filtering: [0, 1e7] words admits every example in the split.
        scenario = InfiniteBenchSumScenario(min_num_word=0, max_num_word=1e7)
        instances = scenario.get_instances(tmpdir)
        assert len(instances) == 103
        assert instances[0].split == "test"
        assert len(instances[0].input.text) == 1745528
        assert instances[0].extra_data
        assert instances[0].extra_data["word_count"] == 308762
        references = instances[0].references
        assert len(references[0].output.text) == 2865
        assert references[0].tags == [CORRECT_TAG]

        # Upper bound only: keep prompts of at most 100k words.
        scenario = InfiniteBenchSumScenario(min_num_word=0, max_num_word=100e3)
        instances = scenario.get_instances(tmpdir)
        assert len(instances) == 48
        assert instances[0].split == "test"
        assert len(instances[0].input.text) == 381778
        assert instances[0].extra_data
        assert instances[0].extra_data["word_count"] == 69458
        references = instances[0].references
        assert len(references[0].output.text) == 4217
        assert references[0].tags == [CORRECT_TAG]

        # Two-sided window: 30k–80k words. The second instance is checked here
        # (presumably the first differs from the earlier cases — the asserts
        # intentionally probe a different element).
        scenario = InfiniteBenchSumScenario(min_num_word=30e3, max_num_word=80e3)
        instances = scenario.get_instances(tmpdir)
        assert len(instances) == 32
        assert instances[0].split == "test"
        assert len(instances[1].input.text) == 383396
        assert instances[1].extra_data
        assert instances[1].extra_data["word_count"] == 68482
        references = instances[1].references
        assert len(references[0].output.text) == 5667
        assert references[0].tags == [CORRECT_TAG]

0 commit comments

Comments
 (0)