Skip to content

Commit 82f9d58

Browse files
authored
feat: add complete implementation of CLEAR dataset (#3466)
1 parent 7f44dd3 commit 82f9d58

File tree

4 files changed

+189
-39
lines changed

4 files changed

+189
-39
lines changed

src/helm/benchmark/presentation/run_entries_medhelm.conf

+103-6
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,109 @@ entries: [
1212
{description: "medcalc_bench:model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
1313
{description: "medcalc_bench:model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
1414

15-
{description: "clear:model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
16-
{description: "clear:model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
17-
{description: "clear:model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
18-
{description: "clear:model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
19-
{description: "clear:model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
20-
{description: "clear:model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
15+
#Alcohol Dependence
16+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
17+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
18+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
19+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
20+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
21+
{description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
22+
23+
#Attention Deficit Hyperactivity Disorder
24+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
25+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
26+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
27+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
28+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
29+
{description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
30+
31+
#Bipolar Disorder
32+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
33+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
34+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
35+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
36+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
37+
{description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
38+
39+
#Chronic Pain
40+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
41+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
42+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
43+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
44+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
45+
{description: "clear:condition=chronic_pain,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
46+
47+
#Homelessness
48+
{description: "clear:condition=homelessness,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
49+
{description: "clear:condition=homelessness,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
50+
{description: "clear:condition=homelessness,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
51+
{description: "clear:condition=homelessness,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
52+
{description: "clear:condition=homelessness,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
53+
{description: "clear:condition=homelessness,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
54+
55+
#Liver Disease
56+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
57+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
58+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
59+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
60+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
61+
{description: "clear:condition=liver_disease,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
62+
63+
#Major Depression
64+
{description: "clear:condition=major_depression,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
65+
{description: "clear:condition=major_depression,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
66+
{description: "clear:condition=major_depression,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
67+
{description: "clear:condition=major_depression,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
68+
{description: "clear:condition=major_depression,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
69+
{description: "clear:condition=major_depression,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
70+
71+
#Personality Disorder
72+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
73+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
74+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
75+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
76+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
77+
{description: "clear:condition=personality_disorder,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
78+
79+
#Post-Traumatic Stress Disorder
80+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
81+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
82+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
83+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
84+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
85+
{description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
86+
87+
#Substance Use Disorder
88+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
89+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
90+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
91+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
92+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
93+
{description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
94+
95+
#Suicidal Behavior
96+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
97+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
98+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
99+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
100+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
101+
{description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
102+
103+
#Tobacco Dependence
104+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
105+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
106+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
107+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
108+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
109+
{description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
110+
111+
#Unemployment
112+
{description: "clear:condition=unemployment,max_eval_instances=100,model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},
113+
{description: "clear:condition=unemployment,max_eval_instances=100,model=openai/gpt-4o-2024-05-13,model_deployment=stanfordhealthcare/gpt-4o-2024-05-13", priority: 1},
114+
{description: "clear:condition=unemployment,max_eval_instances=100,model=openai/gpt-4o-mini-2024-07-18,model_deployment=stanfordhealthcare/gpt-4o-mini-2024-07-18", priority: 1},
115+
{description: "clear:condition=unemployment,max_eval_instances=100,model=meta/llama-3.3-70b-instruct,model_deployment=stanfordhealthcare/llama-3.3-70b-instruct", priority: 1},
116+
{description: "clear:condition=unemployment,max_eval_instances=100,model=qwen/qwen2.5-7b-instruct,model_deployment=huggingface/qwen2.5-7b-instruct-4bit", priority: 1},
117+
{description: "clear:condition=unemployment,max_eval_instances=100,model=microsoft/phi-3.5-mini-instruct,model_deployment=huggingface/phi-3.5-mini-instruct-4bit", priority: 1},
21118

22119
### Planning Treatments ###
23120
{description: "mtsamples_replicate:model=google/gemini-1.5-pro-001,model_deployment=stanfordhealthcare/gemini-1.5-pro-001", priority: 1},

src/helm/benchmark/run_specs/medhelm_run_specs.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,19 @@ def get_medcalc_bench_spec() -> RunSpec:
5656

5757

5858
@run_spec_function("clear")
59-
def get_clear_spec() -> RunSpec:
60-
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario", args={})
59+
def get_clear_spec(condition: str) -> RunSpec:
60+
scenario_spec = ScenarioSpec(
61+
class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario", args={"condition": condition}
62+
)
63+
64+
condition_display = condition.replace("_", " ")
6165

6266
adapter_spec = get_multiple_choice_adapter_spec(
6367
method=ADAPT_MULTIPLE_CHOICE_JOINT,
6468
instructions=(
65-
"Answer 'A' for 'Has a history of alcohol dependence', "
66-
"'B' for 'Does not have a history of alcohol dependence;, or 'C' for 'Uncertain'"
69+
f"Answer 'A' for 'Has a history of {condition_display}', "
70+
f"'B' for 'Does not have a history of {condition_display}', or "
71+
f"'C' for 'Uncertain'"
6772
),
6873
input_noun=None,
6974
output_noun="Respond only with 'A', 'B', or 'C'. Do not add any other text, punctuation, or symbols",
@@ -72,7 +77,7 @@ def get_clear_spec() -> RunSpec:
7277
)
7378

7479
return RunSpec(
75-
name="clear",
80+
name=f"clear:condition={condition}",
7681
scenario_spec=scenario_spec,
7782
adapter_spec=adapter_spec,
7883
metric_specs=get_exact_match_metric_specs(),

0 commit comments

Comments
 (0)