stanford-crfm
diff --git a/‎docs/reproducing_leaderboards.md
+11-1 b/‎docs/reproducing_leaderboards.md
+11-1
diff --git a/‎helm-frontend/project_metadata.json
+6 b/‎helm-frontend/project_metadata.json
+6
diff --git a/‎helm-frontend/src/assets/seahelm/aisingapore.png
65.3 KB b/‎helm-frontend/src/assets/seahelm/aisingapore.png
65.3 KB
diff --git a/‎helm-frontend/src/components/Landing/SEAHELMLanding.tsx
+10-1 b/‎helm-frontend/src/components/Landing/SEAHELMLanding.tsx
+10-1
diff --git a/‎setup.cfg
+2-2 b/‎setup.cfg
+2-2
diff --git a/‎src/helm/benchmark/metrics/bhasa_metrics_specs.py
-10 b/‎src/helm/benchmark/metrics/bhasa_metrics_specs.py
-10
diff --git a/‎src/helm/benchmark/metrics/bhasa_metrics.py ‎src/helm/benchmark/metrics/seahelm_metrics.py
+3-3 b/‎src/helm/benchmark/metrics/bhasa_metrics.py ‎src/helm/benchmark/metrics/seahelm_metrics.py
+3-3
diff --git a/‎src/helm/benchmark/metrics/seahelm_metrics_specs.py
+10 b/‎src/helm/benchmark/metrics/seahelm_metrics_specs.py
+10
diff --git a/‎src/helm/benchmark/presentation/run_entries_bhasa.conf ‎src/helm/benchmark/presentation/run_entries_seahelm.conf
+2-2 b/‎src/helm/benchmark/presentation/run_entries_bhasa.conf ‎src/helm/benchmark/presentation/run_entries_seahelm.conf
+2-2
diff --git a/‎src/helm/benchmark/presentation/run_entries_bhasa_zero_shot.conf ‎src/helm/benchmark/presentation/run_entries_seahelm_zero_shot.conf
+2-2 b/‎src/helm/benchmark/presentation/run_entries_bhasa_zero_shot.conf ‎src/helm/benchmark/presentation/run_entries_seahelm_zero_shot.conf
+2-2
@@ -132,4 +132,14 @@ export SCHEMA_PATH=schema_safety.yaml
 export NUM_TRAIN_TRIALS=1
 export NUM_EVAL_INSTANCES=1000
 export PRIORITY=2
-```
+```
+
+### SEA-HELM
+
+```bash
+export RUN_ENTRIES_CONF_PATH=run_entries_seahelm_zero_shot.conf
+export SCHEMA_PATH=schema_seahelm.yaml
+export NUM_TRAIN_TRIALS=1
+export NUM_EVAL_INSTANCES=1000
+export PRIORITY=2
+```
@@ -66,6 +66,12 @@
 		"releases": ["v1.1.0", "v1.0.0"]
 	},
 	{
+		"title": "SEA-HELM",
+		"description": "Assessment of large language models across various tasks, emphasizing Southeast Asian languages",
+		"id": "seahelm",
+		"releases": ["v1.0.0"]
+	},
+	{
 		"title": "MMLU-Winogrande-Afr",
 		"description": "Clinical MMLU and Winogrande in 11 low-resource African languages",
 		"id": "mmlu-winogrande-afr",
 
@@ -1,6 +1,6 @@
 import MiniLeaderboard from "@/components/MiniLeaderboard";
 import { Link } from "react-router-dom";
-
+import aisingapore from "@/assets/seahelm/aisingapore.png";
 export default function SEAHELMLanding() {
   return (
     <div className="container mx-auto px-16">
@@ -9,6 +9,15 @@ export default function SEAHELMLanding() {
       </h1>
       <div className="flex flex-col lg:flex-row gap-8">
         <div className="flex-1 text-l">
+          <div className="text-center">
+            <a href="https://aisingapore.org/">
+              <img
+                src={aisingapore}
+                alt="Logo"
+                className="inline h-32 mx-4 my-4"
+              />
+            </a>
+          </div>
           With the rapid emergence of novel capabilities in Large Language
           Models (LLMs), the need for rigorous multilingual and multicultural
           benchmarks that are integrated has become more pronounced. Though
 
@@ -114,7 +114,7 @@ mongo =
 unitxt =
     evaluate~=0.4.1
 
-bhasa = 
+seahelm = 
     pythainlp==5.0.0
     pyonmttok==1.37.0
     sacrebleu~=2.2.1
@@ -324,7 +324,7 @@ all =
     crfm-helm[heim]
     crfm-helm[vlm]
     crfm-helm[audiolm]
-    # crfm-helm[bhasa] is excluded because pyonmttok does not support Python 3.12
+    # crfm-helm[seahelm] is excluded because pyonmttok does not support Python 3.12
     # crfm-helm[dev] is excluded because end-users don't need it.
     # crfm-helm[summarize] is excluded because it requires torch<2.0
     # TODO(#2280): Add crfm-helm[summarize] back.
 
@@ -14,7 +14,7 @@
 from helm.benchmark.metrics.statistic import Stat
 
 
-class BhasaMachineTranslationMetric(Metric):
+class SEAHELMMachineTranslationMetric(Metric):
     """Machine Translation Metrics
 
     This class computes the following standard machine translation metrics
@@ -74,8 +74,8 @@ def evaluate_generation(
         return result
 
 
-class BhasaQAMetric(Metric):
-    """Bhasa QA Metrics
+class SEAHELMQAMetric(Metric):
+    """SEA-HELM QA Metrics
 
     This class computes the following standard SQuAD v1.1 metrics
 
 
@@ -0,0 +1,10 @@
+from typing import Any, Dict, List
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+def get_seahelm_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMMachineTranslationMetric")]
+
+
+def get_seahelm_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMQAMetric", args=args)]
@@ -1,8 +1,8 @@
 ##############################
-# BHASA 5-shot Evaluation #
+# SEA-HELM 5-shot Evaluation #
 ##############################
 
-# BHASA Run Specs
+# SEA-HELM Run Specs
 ##   A. Natural Language Understanding
 ##   B. Natural Language Generation
 ##   C. Natural Language Reasoning
 
@@ -1,8 +1,8 @@
 ##############################
-# BHASA Zero-shot Evaluation #
+# SEA-HELM Zero-shot Evaluation #
 ##############################
 
-# BHASA Run Specs
+# SEA-HELM Run Specs
 ##   A. Natural Language Understanding
 ##   B. Natural Language Generation
 ##   C. Natural Language Reasoning