Added the MIMIC-IV-BHC benchmark to MedHelm scenarios (#3459)

asad-aali · web-flow · commit d556e189c129 · 2025-03-24T17:10:17.000-07:00
diff --git a/src/helm/benchmark/presentation/run_entries_medhelm.conf b/src/helm/benchmark/presentation/run_entries_medhelm.conf
@@ -121,6 +121,14 @@ entries: [
   {description: "mimic_rrs:model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
   {description: "mimic_rrs:model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
 
+  ### Summarizing Discharge Notes ###
+  {description: "mimic_bhc:model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
+  {description: "mimic_bhc:model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
+  {description: "mimic_bhc:model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
+  {description: "mimic_bhc:model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
+  {description: "mimic_bhc:model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
+  {description: "mimic_bhc:model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
+
   ### Documenting Care Plans ###
   {description: "chw_care_plan:model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
   {description: "chw_care_plan:model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -518,6 +518,35 @@ def get_mimic_rrs_spec() -> RunSpec:
     )
 
 
+@run_spec_function("mimic_bhc")
+def get_mimic_bhc_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario", args={})
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=("Summarize the clinical note into a brief hospital course."),
+        input_noun="Clinical Note",
+        output_noun="Brief Hospital Course",
+        newline_after_input_noun=True,
+        newline_after_output_noun=True,
+        max_tokens=1024,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+    metric_args = {
+        "task": "mimic_bhc",
+        "device": get_torch_device_name(),
+        "bertscore_model": "distilbert-base-uncased",
+        "rescale_with_baseline": False,
+    }
+    return RunSpec(
+        name="mimic_bhc",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_summarization_metric_specs(metric_args),
+        groups=["mimic_bhc"],
+    )
+
+
 @run_spec_function("chw_care_plan")
 def get_chw_care_plan_run_spec() -> RunSpec:
     """
diff --git a/src/helm/benchmark/scenarios/mimic_bhc_scenario.py b/src/helm/benchmark/scenarios/mimic_bhc_scenario.py
@@ -0,0 +1,97 @@
+import json
+from typing import Dict, List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class MIMICBHCScenario(Scenario):
+    """
+    MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
+    course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
+
+    In total, the dataset contains 270,033 clinical notes.
+    The splits are provided by the dataset itself.
+
+    Sample Synthetic Prompt:
+        Summarize the clinical note into a brief hospital course.
+
+        Clinical Note:
+        <SEX> M <SERVICE> SURGERY <ALLERGIES> No Known Allergies \/ Adverse Drug Reactions
+        ...
+        continue to follow-up with your health care providers as an outpatient.
+
+        Brief Hospital Course:
+        Mr. ___ was pre-admitted on ___ for liver transplantation
+        ...
+        discharged home to continue home medications and follow-up as an outpatient.
+
+    @article{aali2024dataset,
+        title={A dataset and benchmark for hospital course summarization with adapted large language models},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={Journal of the American Medical Informatics Association},
+        volume={32},
+        number={3},
+        pages={470--479},
+        year={2024},
+        publisher={Oxford University Press}
+    }
+
+    @article{aali2024mimic,
+        title={MIMIC-IV-Ext-BHC: Labeled Clinical Notes Dataset for Hospital Course Summarization},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={PhysioNet},
+        year={2024}
+    }
+    """
+
+    name = "mimic_bhc"
+    description = (
+        "A summarization task using a curated collection of preprocessed discharge notes"
+        " paired with their corresponding brief hospital course (BHC) summaries."
+    )
+    tags = ["summarization", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/bhc-mimiciv/mimic_iv_bhc.json"
+        ensure_directory_exists(data_path)
+
+        instances: List[Instance] = []
+        # Limit to zero shot setting for now
+        splits: Dict[str, str] = {
+            # "train": TRAIN_SPLIT,
+            # "validate": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        with open(data_path, "r") as f:
+            data = [json.loads(line) for line in f]
+
+        for data_split, split in splits.items():
+            clinical_notes: List[str] = [x["input"] for x in data]
+            bhc_summaries: List[str] = [x["target"] for x in data]
+            assert len(clinical_notes) == len(bhc_summaries), "Notes and summaries must have the same length"
+            for clinical_note, bhc_summary in zip(clinical_notes, bhc_summaries):
+                if not clinical_note or not bhc_summary:
+                    continue
+                instances.append(
+                    Instance(
+                        input=Input(text=clinical_note),
+                        references=[Reference(Output(text=bhc_summary), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -432,6 +432,7 @@ run_groups:
       - aci_bench
       - mtsamples_procedures
       - mimic_rrs
+      - mimic_bhc
       - chw_care_plan
   
   - name: patient_communication
@@ -676,6 +677,24 @@ run_groups:
       who: Radiologist
       when: Post-imaging
       language: English
+
+  - name: mimic_bhc
+    display_name: MIMIC-IV-BHC
+    short_display_name: MIMIC-BHC
+    description: A summarization task using a curated collection of preprocessed discharge notes paired with their corresponding brief hospital course (BHC) summaries [(Aali et al., 2024)](https://doi.org/10.1093/jamia/ocae312).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: BERTScore-F
+      main_split: test
+    taxonomy:
+      task: Text generation
+      what: Summarize the clinical note into a brief hospital course
+      who: Clinician
+      when: Upon hospital discharge
+      language: English
   
   - name: mimiciv_billing_code
     display_name: MIMIC-IV Billing Code