Skip to content

Commit 62d9e8a

Browse files
Add SEA-HELM leaderboard and SEA-LIONv3 models (#3347)
1 parent c7d4866 commit 62d9e8a

15 files changed

+234
-91
lines changed

docs/reproducing_leaderboards.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,14 @@ export SCHEMA_PATH=schema_safety.yaml
132132
export NUM_TRAIN_TRIALS=1
133133
export NUM_EVAL_INSTANCES=1000
134134
export PRIORITY=2
135-
```
135+
```
136+
137+
### SEA-HELM
138+
139+
```bash
140+
export RUN_ENTRIES_CONF_PATH=run_entries_seahelm_zero_shot.conf
141+
export SCHEMA_PATH=schema_seahelm.yaml
142+
export NUM_TRAIN_TRIALS=1
143+
export NUM_EVAL_INSTANCES=1000
144+
export PRIORITY=2
145+
```

helm-frontend/project_metadata.json

+6
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@
6666
"releases": ["v1.1.0", "v1.0.0"]
6767
},
6868
{
69+
"title": "SEA-HELM",
70+
"description": "Assessment of large language models across various tasks, emphasizing Southeast Asian languages",
71+
"id": "seahelm",
72+
"releases": ["v1.0.0"]
73+
},
74+
{
6975
"title": "MMLU-Winogrande-Afr",
7076
"description": "Clinical MMLU and Winogrande in 11 low-resource African languages",
7177
"id": "mmlu-winogrande-afr",
65.3 KB
Loading

helm-frontend/src/components/Landing/SEAHELMLanding.tsx

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import MiniLeaderboard from "@/components/MiniLeaderboard";
22
import { Link } from "react-router-dom";
3-
3+
import aisingapore from "@/assets/seahelm/aisingapore.png";
44
export default function SEAHELMLanding() {
55
return (
66
<div className="container mx-auto px-16">
@@ -9,6 +9,15 @@ export default function SEAHELMLanding() {
99
</h1>
1010
<div className="flex flex-col lg:flex-row gap-8">
1111
<div className="flex-1 text-l">
12+
<div className="text-center">
13+
<a href="https://aisingapore.org/">
14+
<img
15+
src={aisingapore}
16+
alt="Logo"
17+
className="inline h-32 mx-4 my-4"
18+
/>
19+
</a>
20+
</div>
1221
With the rapid emergence of novel capabilities in Large Language
1322
Models (LLMs), the need for rigorous multilingual and multicultural
1423
benchmarks that are integrated has become more pronounced. Though

setup.cfg

+2-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ mongo =
114114
unitxt =
115115
evaluate~=0.4.1
116116

117-
bhasa =
117+
seahelm =
118118
pythainlp==5.0.0
119119
pyonmttok==1.37.0
120120
sacrebleu~=2.2.1
@@ -324,7 +324,7 @@ all =
324324
crfm-helm[heim]
325325
crfm-helm[vlm]
326326
crfm-helm[audiolm]
327-
# crfm-helm[bhasa] is excluded because pyonmttok does not support Python 3.12
327+
# crfm-helm[seahelm] is excluded because pyonmttok does not support Python 3.12
328328
# crfm-helm[dev] is excluded because end-users don't need it.
329329
# crfm-helm[summarize] is excluded because it requires torch<2.0
330330
# TODO(#2280): Add crfm-helm[summarize] back.

src/helm/benchmark/metrics/bhasa_metrics_specs.py

-10
This file was deleted.

src/helm/benchmark/metrics/bhasa_metrics.py src/helm/benchmark/metrics/seahelm_metrics.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from helm.benchmark.metrics.statistic import Stat
1515

1616

17-
class BhasaMachineTranslationMetric(Metric):
17+
class SEAHELMMachineTranslationMetric(Metric):
1818
"""Machine Translation Metrics
1919
2020
This class computes the following standard machine translation metrics
@@ -74,8 +74,8 @@ def evaluate_generation(
7474
return result
7575

7676

77-
class BhasaQAMetric(Metric):
78-
"""Bhasa QA Metrics
77+
class SEAHELMQAMetric(Metric):
78+
"""SEAHELM QA Metrics
7979
8080
This class computes the following standard SQuAD v1.1 metrics
8181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from typing import Any, Dict, List
2+
from helm.benchmark.metrics.metric import MetricSpec
3+
4+
5+
def get_seahelm_machine_translation_metric_specs() -> List[MetricSpec]:
6+
return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMMachineTranslationMetric")]
7+
8+
9+
def get_seahelm_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
10+
return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMQAMetric", args=args)]

src/helm/benchmark/presentation/run_entries_bhasa.conf src/helm/benchmark/presentation/run_entries_seahelm.conf

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
##############################
2-
# BHASA 5-shot Evaluation #
2+
# SEA-HELM 5-shot Evaluation #
33
##############################
44

5-
# BHASA Run Specs
5+
# SEA-HELM Run Specs
66
## A. Natural Language Understanding
77
## B. Natural Language Generation
88
## C. Natural Language Reasoning

src/helm/benchmark/presentation/run_entries_bhasa_zero_shot.conf src/helm/benchmark/presentation/run_entries_seahelm_zero_shot.conf

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
##############################
2-
# BHASA Zero-shot Evaluation #
2+
# SEA-HELM Zero-shot Evaluation #
33
##############################
44

5-
# BHASA Run Specs
5+
# SEA-HELM Run Specs
66
## A. Natural Language Understanding
77
## B. Natural Language Generation
88
## C. Natural Language Reasoning

0 commit comments

Comments
 (0)