
Commit 7f44dd3

yifanmai, ryokawajp, and mtake authored

Add ConvFinQACalc (#3453)

Co-authored-by: Ryo Kawahara <ryokawa@jp.ibm.com>
Co-authored-by: Mikio Takeuchi <mtake@jp.ibm.com>

1 parent d556e18 commit 7f44dd3

4 files changed, +216 -0 lines changed
src/helm/benchmark/metrics/conv_fin_qa_calc_metrics.py

@@ -0,0 +1,72 @@
import re
from typing import Any, List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.common.hierarchical_logger import hlog


def _strip_string(s: str) -> Any:
    # Extract the first numeric constant in the string; regex from https://stackoverflow.com/a/4703508
    numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?"
    match = re.search(numeric_const_pattern, s)
    if match:
        try:
            return float(s[match.start() : match.end()])
        except Exception:
            return None
    return None


def float_equiv(str1: str, str2: str, eps: float = 1e-6) -> float:
    """Check if two values have the same float value, up to a small tolerance.

    This is the implementation used in the IBM Enterprise Benchmark paper.

    Note: This is a "mostly-correct" equality function and does not handle some cases correctly:

    - If either value cannot be parsed as a float, it returns 0.0,
      regardless of whether the raw strings match.
    - If the two values have different units (e.g. currency symbols,
      a trailing "M" or "B", or a trailing %), the values are not converted to the same
      units before comparison.
    """
    try:
        ss1 = _strip_string(str1)
        ss2 = _strip_string(str2)

        if ss1 is None or ss2 is None:
            hlog("WARNING: float_equiv returning 0.0 because at least one value is not a float")
            return 0.0
        return float(abs(ss1 - ss2) < eps)
    except Exception:
        return float(str1 == str2)


class ConvFinQACalcMetric(Metric):
    """Score metrics for ConvFinQACalc."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        assert request_state.result
        assert len(request_state.result.completions) == 1
        model_answer = request_state.result.completions[0].text

        assert len(request_state.instance.references) == 1
        assert len(request_state.instance.references[0].tags) == 1
        assert request_state.instance.references[0].tags[0] == CORRECT_TAG
        gold_answer = request_state.instance.references[0].output.text

        return [
            Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
        ]
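To make the scoring behavior concrete, here is a small illustrative sketch of float_equiv on ConvFinQA-style answers. The import path is inferred from the MetricSpec used in the run spec below, and the commented values follow from the regex-based stripping rather than from a verified run:

from helm.benchmark.metrics.conv_fin_qa_calc_metrics import float_equiv

float_equiv("21.5", "21.5")            # 1.0: exact numeric match
float_equiv("$ 21.5 million", "21.5")  # 1.0: the regex extracts the first number and ignores surrounding text
float_equiv("14.1%", "0.141")          # 0.0: "%" is not normalized, so 14.1 is compared against 0.141
float_equiv("n/a", "none")             # 0.0: neither string contains a number, so a warning is logged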

src/helm/benchmark/run_specs/enterprise_run_specs.py

+25
@@ -75,6 +75,31 @@ def get_financial_phrasebank_spec(agreement: int = 50) -> RunSpec:
    )


@run_spec_function("conv_fin_qa_calc")
def get_conv_fin_qa_calc_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.conv_fin_qa_calc_scenario.ConvFinQACalcScenario", args={}
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="Based on the table, answer the final question. Respond with the answer only, with no additional explanation.",  # noqa: E501
        input_noun=None,
        output_noun="Answer",
    )

    metric_specs = [
        MetricSpec(class_name="helm.benchmark.metrics.conv_fin_qa_calc_metrics.ConvFinQACalcMetric")
    ] + get_basic_metric_specs([])

    return RunSpec(
        name="conv_fin_qa_calc",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=["conv_fin_qa_calc"],
    )


# Legal
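The new run spec function can be exercised directly as a quick sanity check; in practice it would be selected by its registered name (conv_fin_qa_calc) through HELM's usual run machinery. A minimal sketch, assuming a standard HELM development install:

from helm.benchmark.run_specs.enterprise_run_specs import get_conv_fin_qa_calc_spec

spec = get_conv_fin_qa_calc_spec()
print(spec.name)                                  # conv_fin_qa_calc
print(spec.scenario_spec.class_name)              # ...conv_fin_qa_calc_scenario.ConvFinQACalcScenario
print([m.class_name for m in spec.metric_specs])  # ConvFinQACalcMetric plus the basic metric specs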

src/helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py

@@ -0,0 +1,97 @@
import json
import os
from typing import Dict, List, Any

from helm.benchmark.scenarios.scenario import (
    Input,
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    VALID_SPLIT,
    CORRECT_TAG,
    Output,
)
from helm.common.general import ensure_file_downloaded


class ConvFinQACalcScenario(Scenario):
    """A mathematical calculation benchmark based on ConvFinQA.

    Data source:
    https://github.com/czyssrs/ConvFinQA

    Reference:
    Zhiyu Chen, Shiyang Li, Charese Smiley, Zhiqiang Ma, Sameena Shah, and William Yang Wang. 2022.
    ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering.
    In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing,
    pages 6279–6292, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
    https://aclanthology.org/2022.emnlp-main.421
    """  # noqa: E501

    name = "conv_fin_qa_calc"
    description = "A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering [(Chen et al., 2022)](https://arxiv.org/pdf/2210.03849.pdf)."  # noqa: E501
    tags = ["question_answering", "finance"]

    DATASET_DOWNLOAD_URL: str = (
        "https://github.com/czyssrs/ConvFinQA/raw/cf3eed2d5984960bf06bb8145bcea5e80b0222a6/data.zip"
    )

    _SPLIT_TO_JSON_FILE_NAME: Dict[str, str] = {TRAIN_SPLIT: "train_turn.json", VALID_SPLIT: "dev_turn.json"}

    def make_pseudo_markdown_table(self, table: List[List[Any]], sep: str = "\n") -> str:
        # Linearize each table row into a pipe-separated line.
        markdown_lines: List[str] = []

        for row in table:
            row_inner_markdown = " | ".join([str(cell) for cell in row])
            row_markdown = f"| {row_inner_markdown} |"
            markdown_lines.append(row_markdown)

        return sep.join(markdown_lines)

    def convert_to_instance(self, dic: Dict[str, Any], split: str, sep: str = "\n") -> Instance:
        linearized_table = self.make_pseudo_markdown_table(dic["table"])
        input_text = f"Table: {sep}{linearized_table}{sep}{sep}"

        if "gold_ind" in dic["annotation"]:
            facts = dic["annotation"]["gold_ind"]
        elif "gold_inds" in dic["annotation"]:
            facts = dic["annotation"]["gold_inds"]
        else:
            facts = {}
        table_text = ""
        for fact_type, fact in facts.items():
            if "text" in fact_type:
                table_text += fact
        if table_text:
            input_text += f"Text: {sep}{table_text}{sep}{sep}"

        for ind, q in enumerate(dic["annotation"]["cur_dial"]):
            if ind < len(dic["annotation"]["cur_dial"]) - 1:
                input_text += f"Question: {q}{sep}Answer: {dic['annotation']['exe_ans_list'][ind]}{sep}"
            else:
                input_text += f"Question: {q}"

        answer = str(dic["annotation"]["exe_ans"])
        return Instance(
            input=Input(text=input_text),
            references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
            split=split,
        )

    def get_instances(self, output_path: str) -> List[Instance]:
        data_path = os.path.join(output_path, "data")
        ensure_file_downloaded(
            source_url=self.DATASET_DOWNLOAD_URL,
            target_path=data_path,
            unpack=True,
            unpack_type="unzip",
        )
        instances: List[Instance] = []
        for split, json_file_name in self._SPLIT_TO_JSON_FILE_NAME.items():
            json_file_path = os.path.join(data_path, json_file_name)
            with open(json_file_path) as f:
                raw_instances = json.load(f)
            for raw_instance in raw_instances:
                instances.append(self.convert_to_instance(raw_instance, split))
        return instances
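As a rough illustration of the prompt construction (the table here is invented for the example, not drawn from the dataset, and the module path is inferred from the run spec's class_name), make_pseudo_markdown_table linearizes each row into a pipe-separated line; convert_to_instance then prepends a "Table:" header and appends the supporting text, the previous dialogue turns with their answers, and the final question:

from helm.benchmark.scenarios.conv_fin_qa_calc_scenario import ConvFinQACalcScenario

scenario = ConvFinQACalcScenario()
table = [["", "2018", "2017"], ["net revenue", "100.5", "98.2"]]
print(scenario.make_pseudo_markdown_table(table))
# |  | 2018 | 2017 |
# | net revenue | 100.5 | 98.2 |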

src/helm/benchmark/static/schema_enterprise.yaml

+22
@@ -72,6 +72,10 @@ metrics:
    display_name: Weighted F1
    description: Weighted F1 score
    lower_is_better: false
  - name: float_equiv
    display_name: Float Equivalence
    description: Float Equivalence
    lower_is_better: false

############################################################
perturbations: []
@@ -114,6 +118,7 @@ run_groups:
    subgroups:
      - gold_commodity_news
      - financial_phrasebank
      - conv_fin_qa_calc

  - name: legal_scenarios
    display_name: Legal Scenarios
@@ -156,6 +161,23 @@
      when: before 2013
      language: English

  - name: conv_fin_qa_calc
    display_name: ConvFinQACalc
    description: "A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering [(Chen et al., 2022)](https://arxiv.org/pdf/2210.03849.pdf)."
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: float_equiv
      main_split: valid
    taxonomy:
      task: question answering with numeric reasoning
      what: financial reports
      who: financial experts
      when: 1999 to 2019
      language: English

  - name: gold_commodity_news
    display_name: Gold Commodity News
    description: A classification benchmark based on a dataset of human-annotated gold commodity news headlines ([Sinha & Khandait, 2019](https://arxiv.org/abs/2009.04202)).
