|
| 1 | +import json |
| 2 | +from typing import Dict, List |
| 3 | + |
| 4 | +from helm.common.general import ensure_directory_exists |
| 5 | +from helm.benchmark.scenarios.scenario import ( |
| 6 | + Input, |
| 7 | + Scenario, |
| 8 | + Instance, |
| 9 | + TEST_SPLIT, |
| 10 | + CORRECT_TAG, |
| 11 | + Reference, |
| 12 | + Output, |
| 13 | +) |
| 14 | + |
| 15 | + |
class MIMICBHCScenario(Scenario):
    """
    MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
    course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).

    In total, the dataset contains 270,033 clinical notes.
    The splits are provided by the dataset itself. This scenario currently targets the zero-shot
    setting only: every record read from the data file is emitted in the test split.

    The data file is read as JSON lines: one JSON object per line, with the clinical note under
    the "input" key and the reference summary under the "target" key.

    Sample Synthetic Prompt:
        Summarize the clinical note into a brief hospital course.

        Clinical Note:
        <SEX> M <SERVICE> SURGERY <ALLERGIES> No Known Allergies / Adverse Drug Reactions
        ...
        continue to follow-up with your health care providers as an outpatient.

        Brief Hospital Course:
        Mr. ___ was pre-admitted on ___ for liver transplantation
        ...
        discharged home to continue home medications and follow-up as an outpatient.

    @article{aali2024dataset,
        title={A dataset and benchmark for hospital course summarization with adapted large language models},
        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
        journal={Journal of the American Medical Informatics Association},
        volume={32},
        number={3},
        pages={470--479},
        year={2024},
        publisher={Oxford University Press}
    }

    @article{aali2024mimic,
        title={MIMIC-IV-Ext-BHC: Labeled Clinical Notes Dataset for Hospital Course Summarization},
        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
        journal={PhysioNet},
        year={2024}
    }
    """

    name = "mimic_bhc"
    description = (
        "A summarization task using a curated collection of preprocessed discharge notes"
        " paired with their corresponding brief hospital course (BHC) summaries."
    )
    tags = ["summarization", "biomedical"]

    # Location used when no explicit path is given to the constructor.
    DEFAULT_DATA_PATH = "/share/pi/nigam/data/bhc-mimiciv/mimic_iv_bhc.json"

    def __init__(self, data_path: str = DEFAULT_DATA_PATH):
        """Create the scenario.

        Args:
            data_path: Path to the JSON-lines dataset file. Defaults to `DEFAULT_DATA_PATH`.
        """
        super().__init__()
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """Read the dataset file and build one test-split instance per note/summary pair.

        Args:
            output_path: Output directory supplied by the caller. Unused here, because the
                dataset is read from `self.data_path` and nothing is written.

        Returns:
            One `Instance` per record, with the clinical note as input and the BHC summary as
            the single reference tagged `CORRECT_TAG`. Records whose note or summary is empty
            are skipped.

        Raises:
            FileNotFoundError: If `self.data_path` does not exist.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the "input" or "target" key.
        """
        instances: List[Instance] = []

        # The dataset is a pre-existing input file, so no directory is created for it;
        # open() reports a missing file directly.
        with open(self.data_path, "r", encoding="utf-8") as f:
            # Stream line by line so the raw file is never held in memory alongside the instances.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                record = json.loads(line)
                clinical_note = record["input"]
                bhc_summary = record["target"]
                if not clinical_note or not bhc_summary:
                    continue
                instances.append(
                    Instance(
                        input=Input(text=clinical_note),
                        references=[Reference(Output(text=bhc_summary), tags=[CORRECT_TAG])],
                        # Limited to the zero-shot setting for now: everything goes to the test split.
                        split=TEST_SPLIT,
                    )
                )

        return instances
0 commit comments