
Commit 87cd4d8

MiguelAFH, aunell, suhana13, Miking98, and haoqiu1 authored
MedHELM V1 (#3403)
Co-authored-by: Alyssa Unell <alyunell9@gmail.com>
Co-authored-by: suhana13 <suhana@stanford.edu>
Co-authored-by: Michael Wornow <mwornow98@gmail.com>
Co-authored-by: Suhana Bedi <57412795+suhana13@users.noreply.github.com>
Co-authored-by: haoqiu1 <haoqiu@microsoft.com>
Co-authored-by: HennyJie <cuihejie331771@gmail.com>
Co-authored-by: Juan M. Banda <juan@jmbanda.com>
1 parent 4a1dd46 commit 87cd4d8

File tree

60 files changed: +8571 −669 lines


requirements.txt

+7 −1

@@ -14,13 +14,14 @@ anthropic==0.38.0
 antlr4-python3-runtime==4.9.3
 anyio==4.8.0
 astunparse==1.6.3
-async-timeout==5.0.1
+async-timeout==4.0.3
 attrs==24.3.0
 audioread==3.0.1
 autokeras==1.0.20
 av==14.0.1
 awscli==1.33.44
 beautifulsoup4==4.12.3
+bert_score==0.3.13
 black==24.3.0
 blis==1.1.0
 boto3==1.34.162
@@ -131,6 +132,8 @@ keras==3.8.0
 keras-tuner==1.4.7
 kiwisolver==1.4.7
 kt-legacy==1.0.5
+langchain==0.3.9
+langchain-community==0.3.8
 langcodes==3.5.0
 langdetect==1.0.9
 language_data==1.3.0
@@ -223,6 +226,7 @@ pypinyin==0.49.0
 PySocks==1.7.1
 pytest==7.2.2
 python-dateutil==2.8.2
+python-docx==1.1.2
 python-utils==3.9.1
 pytorch-fid==0.3.0
 pytorch-lightning==2.0.9.post0
@@ -232,6 +236,8 @@ PyWavelets==1.6.0
 PyYAML==6.0.2
 qwen-vl-utils==0.0.8
 RapidFuzz==3.11.0
+rank_bm25==0.2.2
+referencing==0.35.1
 regex==2024.11.6
 reka-api==2.0.0
 requests==2.32.3

setup.cfg

+9

@@ -279,6 +279,15 @@ heim =
     # Shared image dependencies
     crfm-helm[images]
 
+medhelm =
+    # Summarization metrics
+    crfm-helm[summarization]
+
+    #MedHELM scenarios
+    python-docx~=1.1.2
+    langchain~=0.3.9
+    lxml~=5.3.0
+
 audiolm =
     crfm-helm[openai]
     crfm-helm[google]
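For context: medhelm is a new pip extras group. Assuming the package is installed from PyPI under the crfm-helm name already used elsewhere in this file, it would typically be enabled with a command along the lines of pip install "crfm-helm[medhelm]", which pulls in the summarization extra together with python-docx, langchain, and lxml.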
New file (filename not shown in this view): EhrSqlAnnotator

+87

@@ -0,0 +1,87 @@
from typing import Any, List, Optional
import os
import re
import sqlite3
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.common.hierarchical_logger import hlog
from helm.benchmark.runner import get_benchmark_output_path


class EhrSqlAnnotator(Annotator):
    """
    Executes both ground truth and generated SQL queries on the eicu.sqlite database.
    """

    name = "ehr_sql"

    def annotate(self, request_state: RequestState) -> Any:
        """Evaluate SQL execution accuracy by running queries against the eicu.sqlite database."""

        databases_root_path = os.path.join(get_benchmark_output_path(), "scenarios", "ehr_sql")
        database_path = os.path.join(databases_root_path, "eicu.sqlite")

        assert len(request_state.instance.references) == 1
        ground_truth_sql = request_state.instance.references[0].output.text.strip()
        ground_truth_result: List[str] = []

        # Execute the ground truth query
        try:
            with sqlite3.connect(database_path) as conn:
                cursor = conn.cursor()
                cursor.execute(ground_truth_sql)
                ground_truth_result = cursor.fetchall()
        except (sqlite3.OperationalError, sqlite3.Warning) as e:
            hlog(f"WARNING: Ground truth SQL failed with error: {e}")

        # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
        if not ground_truth_result and request_state.instance.extra_data is not None:
            if "value" in request_state.instance.extra_data:
                extra_values = list(request_state.instance.extra_data["value"].values())

                # Try inferring types from the database schema if possible
                with sqlite3.connect(database_path) as conn:
                    cursor = conn.cursor()
                    try:
                        cursor.execute(ground_truth_sql)
                        fetched_result = cursor.fetchone()
                        if fetched_result:
                            # Convert extra_values to match SQLite's expected types
                            converted_values = [
                                type(fetched_result[i])(extra_values[i]) for i in range(len(extra_values))
                            ]
                            ground_truth_result = converted_values
                        else:
                            # If no rows were fetched, use `extra_values` as-is
                            ground_truth_result = extra_values
                    except sqlite3.OperationalError:
                        # If query fails (syntax error, etc.), just use `extra_values` as-is
                        ground_truth_result = extra_values

        assert request_state.result is not None
        assert len(request_state.result.completions) == 1
        predicted_text = request_state.result.completions[0].text.strip()

        predicted_sql_match = re.search(r"<\s*sql\s*>(.*?)<\/?\s*sql\s*>", predicted_text, re.DOTALL | re.IGNORECASE)
        predicted_sql = predicted_sql_match.group(1).strip() if predicted_sql_match else predicted_text.strip()

        predicted_result: List[str] = []
        query_error: Optional[str] = None
        predicted_sql = predicted_sql.replace("`", "").strip()
        predicted_sql = re.sub(r"^sql\n", "", predicted_sql, flags=re.MULTILINE)
        if not predicted_sql:
            query_error = "No query generated"
        else:
            try:
                with sqlite3.connect(database_path) as conn:
                    cursor = conn.cursor()
                    cursor.execute(predicted_sql)
                    predicted_result = cursor.fetchall()
            except (sqlite3.OperationalError, sqlite3.Warning) as e:
                query_error = str(e)

        return {
            "predicted_result": predicted_result,
            "ground_truth_result": ground_truth_result,
            "query_error": query_error,
        }
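To make the output post-processing concrete, here is a minimal standalone sketch, not part of the commit, of the same cleanup the annotator applies to a model completion before executing it; the two sample completions are invented for illustration.

import re


def extract_sql(completion: str) -> str:
    """Illustrative copy of the cleanup steps in EhrSqlAnnotator.annotate."""
    match = re.search(r"<\s*sql\s*>(.*?)<\/?\s*sql\s*>", completion, re.DOTALL | re.IGNORECASE)
    sql = match.group(1).strip() if match else completion.strip()
    sql = sql.replace("`", "").strip()  # strip Markdown backticks
    return re.sub(r"^sql\n", "", sql, flags=re.MULTILINE)  # strip a leading "sql" language tag


# Hypothetical model outputs:
print(extract_sql("<sql>SELECT count(*) FROM patient</sql>"))    # SELECT count(*) FROM patient
print(extract_sql("```sql\nSELECT count(*) FROM patient\n```"))  # SELECT count(*) FROM patient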

src/helm/benchmark/metrics/basic_metrics.py

+2 −2

@@ -5,8 +5,8 @@
 from urllib.parse import unquote
 
 import numpy as np
-import scipy
-import calibration as cal
+import scipy  # type: ignore
+import calibration as cal  # type: ignore
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
New file (filename not shown in this view): EhrSqlMetric

+103

@@ -0,0 +1,103 @@
from typing import List
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.common.hierarchical_logger import hlog


class EhrSqlMetric(Metric):
    """
    Metric for evaluating the EHR SQL dataset, focusing on:
    1. Execution Accuracy – Whether the generated SQL query produces the same results as the ground truth.
    2. Query Validity – Whether the generated SQL executes without errors.
    3. Precision for Answerable Questions (Pans).
    4. Recall for Answerable Questions (Rans).
    """

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Evaluate execution accuracy, query validity, and answerability metrics.
        """

        if not request_state.annotations:
            hlog(f"Warning: Request state missing annotations for instance {request_state.instance}")
            return []

        if "ehr_sql" not in request_state.annotations:
            hlog(f"Warning: 'ehr_sql' key missing in annotations for instance {request_state.instance}")
            return []

        # Extract execution results
        predicted_result = request_state.annotations["ehr_sql"].get("predicted_result", [])
        ground_truth_result = request_state.annotations["ehr_sql"].get("ground_truth_result", [])
        query_error = request_state.annotations["ehr_sql"].get("query_error", None)

        # Extract predictions from the model output
        if request_state.result is None:
            predictions = []
        else:
            predictions = [completion.text.strip() for completion in request_state.result.completions]
        if not predictions:
            hlog(f"Warning: No predictions found in the completions for instance {request_state.instance}")
            return []

        # Process the first prediction as the primary output
        prediction = predictions[0].strip()

        # Extract references and input text
        references = getattr(request_state.instance, "references", None)

        if not references or len(references) == 0:
            hlog(f"Warning: Missing references for instance {request_state.instance}")
            return []

        # Check if the ground truth is answerable based on `is_impossible` flag
        ground_truth_query = references[0].output.text.strip() if references else None
        is_impossible = (
            request_state.instance.extra_data.get("is_impossible", False)
            if request_state.instance.extra_data
            else False
        )

        is_answerable = not is_impossible and bool(ground_truth_query)  # True if the ground truth is answerable
        is_predicted_answerable = bool(prediction)  # True if the model generated a non-empty SQL query
        correct_answerable = int(is_answerable and is_predicted_answerable)  # Correct if both are answerable

        # **Execution Accuracy Fix:**
        execution_accuracy = 0

        if ground_truth_query:
            if ground_truth_result and predicted_result:
                execution_accuracy = int(set(predicted_result) == set(ground_truth_result))  # Compare sets.
            elif not ground_truth_result and not predicted_result and not prediction:
                execution_accuracy = 1  # Both empty and no query was generated.
        elif not ground_truth_query and prediction:
            execution_accuracy = 0  # LLM generated a query when no gold query exists.

        # **Query Validity Fix:**
        if not prediction:  # No SQL query was generated
            query_validity = 0
        elif query_error is None:
            query_validity = 1  # Query executed successfully.
        else:
            query_validity = 0  # Execution error occurred.

        return [
            # Execution-based Metrics
            Stat(MetricName("ehr_sql_execution_accuracy")).add(execution_accuracy),
            Stat(MetricName("ehr_sql_query_validity")).add(query_validity),
            # Answerability Metrics
            Stat(MetricName("ehr_sql_precision_answerable")).add(correct_answerable if is_predicted_answerable else 0),
            Stat(MetricName("ehr_sql_recall_answerable")).add(correct_answerable if is_answerable else 0),
            Stat(MetricName("ehr_sql_total_predicted_answerable")).add(int(is_predicted_answerable)),
            Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(int(is_answerable)),
        ]
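For intuition, here is a minimal sketch, not from the commit, of how the per-instance answerability flags could be rolled up into the Pans/Rans figures named in the docstring. HELM's Stat machinery averages values per run, so the corpus-level ratios below are an assumption about downstream aggregation, and the dict keys simply mirror the local variable names above.

from typing import Dict, List


def answerability_summary(per_instance: List[Dict[str, int]]) -> Dict[str, float]:
    """Aggregate per-instance 0/1 flags into precision/recall over answerable questions."""
    correct = sum(x["correct_answerable"] for x in per_instance)
    predicted = sum(x["is_predicted_answerable"] for x in per_instance)
    answerable = sum(x["is_answerable"] for x in per_instance)
    return {
        "precision_answerable": correct / predicted if predicted else 0.0,  # Pans
        "recall_answerable": correct / answerable if answerable else 0.0,   # Rans
    }


# Toy example with three invented instances:
print(answerability_summary([
    {"correct_answerable": 1, "is_predicted_answerable": 1, "is_answerable": 1},
    {"correct_answerable": 0, "is_predicted_answerable": 1, "is_answerable": 0},
    {"correct_answerable": 0, "is_predicted_answerable": 0, "is_answerable": 1},
]))  # {'precision_answerable': 0.5, 'recall_answerable': 0.5}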
New file (filename not shown in this view): MedCalcBenchMetric

+124

@@ -0,0 +1,124 @@
import re

from datetime import datetime
from typing import List, Dict, Any
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.common.hierarchical_logger import hlog


class MedCalcBenchMetric(Metric):
    """
    Metric for evaluating the MedCalc Bench dataset, assessing the model's ability to
    be a clinical calculator.

    Exact match based on category:
    1. Normal exact match: for categories "risk", "severity" or "diagnosis".
    2. Variant exact match: for other categories, if the number calculated by the model falls between the values
       in the Lower limit and Upper limit columns, we mark it as accurate.
    """

    def parse_duration(self, duration_str) -> int:
        """Parses a duration tuple (weeks, days) from a string format like ('14 weeks', '2 days')."""
        match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
        if match:
            weeks, days = map(int, match.groups())
            return weeks * 7 + days  # Convert to total days
        else:
            raise ValueError(f"Invalid format: {duration_str}")

    def is_within_range(self, lower_bound, upper_bound, prediction) -> int:
        """
        Checks if a predicted duration falls within the given range.

        Args:
            lower_bound (str): The lower bound in format "('X weeks', 'Y days')".
            upper_bound (str): The upper bound in format "('X weeks', 'Y days')".
            prediction (str): The predicted duration in the same format.

        Returns:
            int: 1 if within range (inclusive), 0 otherwise.
        """
        lower_days = self.parse_duration(lower_bound)
        upper_days = self.parse_duration(upper_bound)
        prediction_days = self.parse_duration(prediction)
        return 1 if lower_days <= prediction_days <= upper_days else 0

    def check_date(self, prediction: str, reference: str, extra_data: Dict[str, Any]) -> int:
        """Checks if the prediction date is within limits."""
        if re.match(r"\('(\d+) weeks', '(\d+) days'\)", reference):
            exact_match = self.is_within_range(extra_data["lower_limit"], extra_data["upper_limit"], prediction)
        else:
            prediction_date = self._str_to_date(prediction)
            upper_limit_date = self._str_to_date(extra_data["upper_limit"])
            lower_limit_date = self._str_to_date(extra_data["lower_limit"])
            exact_match = 1 if lower_limit_date <= prediction_date <= upper_limit_date else 0
        return exact_match

    def _str_to_date(self, date_str: str) -> datetime:
        """Convert string to datetime object."""
        return datetime.strptime(date_str, "%m/%d/%Y")

    def check_in_range(self, prediction: str, reference: str, extra_data: Dict[str, Any], category: str) -> int:
        """Check if the prediction falls within the range specified by the reference."""
        try:
            if category == "date":
                exact_match = self.check_date(prediction, reference, extra_data)
            elif category in ["dosage conversion", "physical"]:
                lower_limit = float(extra_data["lower_limit"])
                upper_limit = float(extra_data["upper_limit"])
                float_prediction = float(prediction)
                exact_match = 1 if lower_limit <= float_prediction <= upper_limit else 0
            else:
                raise ValueError(f"Category {category} not supported")
        except ValueError:
            return 0

        return exact_match

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Evaluate a single generation against reference labels.
        """
        # Extract predictions
        assert request_state.result, "request_state.result is unexpectedly None"
        predictions = [completion.text.strip() for completion in request_state.result.completions]

        if not predictions:
            hlog("Warning: No predictions found in completions")
            return []

        # Get the first prediction
        prediction = predictions[0]

        # Get references
        references = getattr(request_state.instance, "references", None)

        if not references or len(references) == 0:
            hlog(f"Warning: Missing references for instance {request_state.instance}")
            return []

        reference = references[0].output.text

        # Extract category, upper limit and lower limit
        assert request_state.instance.extra_data, "Extra data dict was expected but got None"
        category = request_state.instance.extra_data["category"]

        if category in ["risk", "severity", "diagnosis"]:
            exact_match = 1 if prediction == reference else 0
        else:
            exact_match = self.check_in_range(prediction, reference, request_state.instance.extra_data, category)

        return [
            Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
        ]
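As a quick illustration, not part of the commit, of how the duration-style bounds behave: the snippet below reimplements parse_duration as a standalone function and checks an invented prediction against invented lower/upper limits in the "('X weeks', 'Y days')" format the docstrings describe.

import re


def parse_duration(duration_str: str) -> int:
    """Convert "('X weeks', 'Y days')" into a total number of days (illustrative copy)."""
    match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
    if not match:
        raise ValueError(f"Invalid format: {duration_str}")
    weeks, days = map(int, match.groups())
    return weeks * 7 + days


# Invented bounds and prediction:
lower, upper = "('14 weeks', '0 days')", "('14 weeks', '4 days')"
prediction = "('14 weeks', '2 days')"
print(parse_duration(prediction))                                                     # 100 (days)
print(parse_duration(lower) <= parse_duration(prediction) <= parse_duration(upper))   # True -> counted as accurate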
