# flake8: noqa
# type: ignore
# fmt: off

import json
import random
import re
from typing import Any, List

import numpy as np
from tqdm import tqdm


# The following code is copied verbatim from:
# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
# under the following license:
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Read SQuAD QA dataset
def read_squad(file):
    with open(file) as f:
        data = json.load(f)

    total_docs = [p['context'] for d in data['data'] for p in d['paragraphs']]
    total_docs = sorted(list(set(total_docs)))
    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}

    total_qas = []
    for d in data['data']:
        more_docs = [total_docs_dict[p['context']] for p in d['paragraphs']]
        for p in d['paragraphs']:
            for qas in p['qas']:
                if not qas['is_impossible']:
                    total_qas.append({
                        'query': qas['question'],
                        'outputs': [a['text'] for a in qas['answers']],
                        'context': [total_docs_dict[p['context']]],
                        'more_context': [idx for idx in more_docs if idx != total_docs_dict[p['context']]]
                    })

    return total_qas, total_docs
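
# Usage sketch (illustrative, not from the original file): read_squad expects a
# SQuAD-v2.0-style JSON, i.e. {"data": [{"paragraphs": [{"context": ...,
# "qas": [{"question": ..., "is_impossible": ..., "answers": [...]}]}]}]};
# the file name below is hypothetical.
#
#   qas, docs = read_squad('squad_dev_v2.0.json')
#   print(qas[0]['query'], docs[qas[0]['context'][0]][:80])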

# Read Hotpot QA dataset
def read_hotpotqa(file):
    with open(file) as f:
        data = json.load(f)

    total_docs = [f"{t}\n{''.join(p)}" for d in data for t, p in d['context']]
    total_docs = sorted(list(set(total_docs)))
    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}

    total_qas = []
    for d in data:
        total_qas.append({
            'query': d['question'],
            'outputs': [d['answer']],
            'context': [total_docs_dict[f"{t}\n{''.join(p)}"] for t, p in d['context']],
        })

    return total_qas, total_docs
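
# Usage sketch (illustrative): read_hotpotqa expects the HotpotQA distractor
# JSON, a list of {"question", "answer", "context"} records where "context" is
# a list of [title, [sentence, ...]] pairs; the path below is hypothetical.
#
#   qas, docs = read_hotpotqa('hotpot_dev_distractor_v1.json')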


DOCUMENT_PROMPT = "Document {i}:\n{document}"

def generate_input_output(index, num_docs, template: str, random_seed: int, qas: Any, docs: Any):
    curr_q = qas[index]['query']
    curr_a = qas[index]['outputs']
    curr_docs = qas[index]['context']
    curr_more = qas[index].get('more_context', [])
    if num_docs < len(docs):
        if (num_docs - len(curr_docs)) > len(curr_more):
            addition_docs = [i for i, d in enumerate(docs) if i not in curr_docs + curr_more]
            all_docs = curr_docs + curr_more + random.sample(addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more)))
        else:
            all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))

        all_docs = [docs[idx] for idx in all_docs]
    else:
        all_docs = docs

    random.Random(random_seed).shuffle(all_docs)

    context = '\n\n'.join([DOCUMENT_PROMPT.format(i=i+1, document=d) for i, d in enumerate(all_docs)])
    input_text = template.format(
        context=context,
        query=curr_q
    )
    return input_text, curr_a
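
# Illustrative usage sketch (the template below is an assumption, not the one
# shipped with RULER): the function pads the gold document(s) with distractors
# up to num_docs, shuffles them deterministically, and fills the template.
#
#   template = ("Answer the question based on the given documents.\n\n"
#               "{context}\n\nQuestion: {query} Answer:")
#   text, answers = generate_input_output(0, 20, template=template,
#                                         random_seed=42, qas=qas, docs=docs)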


# The following code has been modified from the original source from:
# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
# under the same Apache 2.0 license included above.


def _text_to_tokens(text: str) -> List[str]:
    # Crude token count via whitespace splitting; these are words, not model
    # tokens, so all lengths below are approximations of true token counts.
    return re.split(r"\s+", text.strip())


def generate_samples(dataset: str, dataset_path: str, template: str, random_seed: int, pre_samples: int, num_samples: int, tokens_to_generate: int, max_seq_length: int, incremental: int = 10, remove_newline_tab: bool = False):
    random.seed(random_seed)
    np.random.seed(random_seed)

    if dataset == 'squad':
        qas, docs = read_squad(dataset_path)
    elif dataset == 'hotpotqa':
        qas, docs = read_hotpotqa(dataset_path)
    else:
        raise NotImplementedError(f'{dataset} is not implemented.')

    write_jsons = []

    # Grow num_docs until the prompt would exceed max_seq_length
    num_docs = incremental

    total_tokens = 0  # Track the total tokens generated for this example
    while total_tokens + tokens_to_generate < max_seq_length:
        input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
        # Calculate the number of tokens in the example
        total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
        if total_tokens + tokens_to_generate > max_seq_length:
            num_docs -= incremental
            break

        num_docs += incremental
        if num_docs > len(docs):
            num_docs = len(docs)
            break
    print('Number of documents:', num_docs)

    # Generate samples
    for index in tqdm(range(num_samples)):
        used_docs = num_docs
        while True:
            try:
                input_text, answer = generate_input_output(index + pre_samples, used_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
                length = len(_text_to_tokens(input_text)) + tokens_to_generate
                assert length <= max_seq_length, f"{length} exceeds max_seq_length."
                break
            except AssertionError:
                # Prompt too long: retry with fewer documents.
                if used_docs > incremental:
                    used_docs -= incremental
                else:
                    raise  # cannot shrink further; re-raise instead of looping forever

        if remove_newline_tab:
            input_text = ' '.join(input_text.replace('\n', ' ').replace('\t', ' ').strip().split())

        formatted_output = {
            "index": index,
            "input": input_text,
            "outputs": answer,
            "length": length
        }
        write_jsons.append(formatted_output)

    return write_jsons
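

# Minimal end-to-end sketch, assuming a local HotpotQA dev file and a simple
# template (both hypothetical; the real driver lives elsewhere in RULER):
if __name__ == "__main__":
    samples = generate_samples(
        dataset='hotpotqa',
        dataset_path='hotpot_dev_distractor_v1.json',  # hypothetical path
        template="Answer based on the documents.\n\n{context}\n\nQuestion: {query} Answer:",
        random_seed=42,
        pre_samples=0,
        num_samples=2,
        tokens_to_generate=32,
        max_seq_length=4096,
    )
    print(samples[0]['length'], samples[0]['outputs'])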