MUStARDScenario for sarcasm detection (#3345)

teetone · web-flow · commit a3cb6b1e8f8b · 2025-02-16T20:12:16.000-08:00
diff --git a/setup.cfg b/setup.cfg
@@ -285,6 +285,9 @@ audiolm =
     # For clipping audio
     pydub~=0.25.1
 
+    # For extracting audio from videos
+    ffmpeg-python~=0.2.0
+
     # For HuggingFace audio datasets
     soundfile~=0.12
     librosa~=0.10
diff --git a/src/helm/benchmark/presentation/run_entries_audio.conf b/src/helm/benchmark/presentation/run_entries_audio.conf
@@ -2,7 +2,6 @@ entries: [
     ####################################################################################################################
     # Auditory Perception
     ####################################################################################################################
-    {description: "meld_audio:model=audiolm", priority: 1}
     {description: "vocal_sound:model=audiolm", priority: 1}
     {description: "audiocaps:model=audiolm", priority: 1}
     {description: "voxceleb2:model=audiolm", priority: 1}
@@ -11,6 +10,12 @@ entries: [
     {description: "air_bench_chat:subject=music,model=audiolm", priority: 1}
     {description: "air_bench_chat:subject=mix,model=audiolm", priority: 1}
 
+    ####################################################################################################################
+    # Emotion detection
+    ####################################################################################################################
+    {description: "meld_audio:model=audiolm", priority: 1}
+    {description: "mustard:model=audiolm", priority: 1}
+
     ####################################################################################################################
     # Robustness
     ####################################################################################################################
diff --git a/src/helm/benchmark/presentation/run_entries_audio_debug.conf b/src/helm/benchmark/presentation/run_entries_audio_debug.conf
@@ -1,3 +1,3 @@
 entries: [
-    {description: "mutox:language=Arabic,model=audiolm", priority: 1}
+    {description: "mustard:model=audiolm", priority: 1}
 ]
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -183,6 +183,23 @@ def get_mutox_audio_run_spec(language: str) -> RunSpec:
     )
 
 
+@run_spec_function("mustard")
+def get_mustard_audio_run_spec() -> RunSpec:
+    """Build the zero-shot, multiple-choice RunSpec for the MUStARD scenario, scored with exact match."""
+    group_name: str = "mustard"
+    return RunSpec(
+        name=group_name,
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.audio_language.mustard_scenario.MUStARDScenario"
+        ),
+        adapter_spec=_get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        ),
+        metric_specs=get_exact_match_metric_specs(),
+        groups=[group_name],
+    )
+
+
 @run_spec_function("covost2")
 def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
diff --git a/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py b/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -15,7 +15,7 @@
 )
 from helm.common.media_object import MediaObject, MultimediaObject
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from helm.common.audio_utils import use_ffmpeg_to_extract_audio_from_video
+from helm.common.audio_utils import extract_audio
 
 
 class CasualConversations2Scenario(Scenario):
@@ -123,7 +123,9 @@ def get_instances(self, output_path: str) -> List[Instance]:
             if file_name.endswith(".mp4"):
                 local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
                 local_video_path: str = os.path.join(data_dir, file_name)
-                use_ffmpeg_to_extract_audio_from_video(local_video_path, local_audio_path)
+
+                if not os.path.exists(local_audio_path):
+                    extract_audio(local_video_path, local_audio_path)
                 assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
 
                 subject_answer = audio_scripts[file_name][self._subject]
diff --git a/src/helm/benchmark/scenarios/audio_language/mustard_scenario.py b/src/helm/benchmark/scenarios/audio_language/mustard_scenario.py
@@ -0,0 +1,142 @@
+import json
+import os
+from typing import List
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import is_invalid_audio_file, extract_audio
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class MUStARDScenario(Scenario):
+    r"""
+    MUStARD: Multimodal Sarcasm Detection Dataset
+
+    A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
+    TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
+    of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
+    additional information on the scenario where it occurs.
+
+    We just extract the audio from the given videos.
+
+    The columns of the dataset are:
+    - utterance: The text of the target utterance to classify.
+    - speaker: Speaker of the target utterance.
+    - context: List of utterances (in chronological order) preceding the target utterance.
+    - context_speakers: Respective speakers of the context utterances.
+    - sarcasm: Binary label for sarcasm tag.
+
+    More specifically an example looks like this:
+
+    "1_60": {
+        "utterance": "It's just a privilege to watch your mind at work.",
+        "speaker": "SHELDON",
+        "context": [
+            "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
+            "My apologies. What's your plan?"
+        ],
+        "context_speakers": [
+            "LEONARD",
+            "SHELDON"
+        ],
+        "show": "BBT",
+        "sarcasm": true
+    }
+
+    The key is the video id.
+
+    The video folder has two subfolders:
+    - context_final: Contains the context videos (e.g., 1_60_c.mp4)
+    - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)
+
+    Citation:
+
+    @inproceedings{mustard,
+        title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
+        author = "Castro, Santiago  and
+          Hazarika, Devamanyu  and
+          P{\'e}rez-Rosas, Ver{\'o}nica  and
+          Zimmermann, Roger  and
+          Mihalcea, Rada  and
+          Poria, Soujanya",
+        booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
+                    (Volume 1: Long Papers)",
+        month = "7",
+        year = "2019",
+        address = "Florence, Italy",
+        publisher = "Association for Computational Linguistics",
+    }
+    """
+
+    RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
+    ANNOTATIONS_URL: str = (
+        "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/sarcasm_data.json"
+    )
+
+    name = "mustard"
+    description = "Sarcasm detection benchmark ([Castro et al, 2019](https://arxiv.org/abs/1906.01815))."
+    tags = ["audio", "classification", "toxicity", "sarcasm detection"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Download MUStARD, extract the audio tracks and build one test instance per annotated utterance."""
+        annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
+        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
+
+        # Where the video files will be downloaded to
+        video_path: str = os.path.join(output_path, "video")
+        ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)
+
+        # Where the audio files will be extracted to
+        audio_path: str = os.path.join(output_path, "audio")
+        ensure_directory_exists(audio_path)
+
+        with open(annotations_path, "r", encoding="utf-8") as annotations_file:
+            annotations = json.load(annotations_file)
+        instances: List[Instance] = []
+        for key, row in tqdm(annotations.items()):
+            # Extract the audio from the context video (skipped when a previous run already produced it)
+            context_audio_path: str = os.path.join(audio_path, f"{key}_c.mp3")
+            if not os.path.exists(context_audio_path):
+                context_video_path: str = os.path.join(video_path, "context_final", f"{key}_c.mp4")
+                extract_audio(context_video_path, context_audio_path)
+            assert not is_invalid_audio_file(context_audio_path), f"Invalid audio file: {context_audio_path}"
+
+            # Extract the audio from the target utterance video
+            utterance_audio_path: str = os.path.join(audio_path, f"{key}.mp3")
+            if not os.path.exists(utterance_audio_path):
+                utterance_video_path: str = os.path.join(video_path, "utterances_final", f"{key}.mp4")
+                extract_audio(utterance_video_path, utterance_audio_path)
+            assert not is_invalid_audio_file(utterance_audio_path), f"Invalid audio file: {utterance_audio_path}"
+
+            input_content = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        # Input both the context and the utterance audio
+                        MediaObject(text="Context:", content_type="text/plain"),
+                        MediaObject(location=context_audio_path, content_type="audio/mpeg"),
+                        MediaObject(text="Utterance:", content_type="text/plain"),
+                        MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
+                        MediaObject(
+                            text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
+                        ),
+                    ]
+                )
+            )
+            is_sarcastic: bool = row["sarcasm"]
+            references = [
+                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
+                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
+            ]
+            instances.append(Instance(input=input_content, references=references, split=TEST_SPLIT))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_audio.yaml b/src/helm/benchmark/static/schema_audio.yaml
@@ -211,7 +211,6 @@ run_groups:
     description: Audio Scenarios
     category: All scenarios
     subgroups:
-      - audio_mnist
       - covost2
       - vocal_sound
       - fleurs
@@ -221,6 +220,7 @@ run_groups:
       - meld_audio
       - air_bench_chat
       - mutox
+      - mustard
 
   - name: audio_mnist
     display_name: AudioMNIST
@@ -523,4 +523,23 @@ run_groups:
       what: samples of utterances
       who: real speakers
       when: "2024"
-      language: 30 langguages
+      language: 30 languages
+
+  - name: mustard
+    display_name: MUStARD
+    description: >
+      A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
+      TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous.
+      ([Castro et al, 2019](https://arxiv.org/abs/1906.01815)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: sarcasm detection
+      what: clips from television shows
+      who: real speakers
+      when: "2019"
+      language: English
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
@@ -3,10 +3,12 @@
 from typing import Optional
 from filelock import FileLock
 
+import ffmpeg
 import numpy as np
 import soundfile as sf
 import subprocess
 
+from helm.common.hierarchical_logger import hlog
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.optional_dependencies import handle_module_not_found_error
 
@@ -55,15 +57,6 @@ def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
         raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
 
 
-def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
-    if os.path.exists(output_audio_path):
-        return
-    try:
-        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
-
-
 def is_invalid_audio_file(audio_path: str) -> bool:
     """
     Two conditions for an audio file to be considered invalid:
@@ -78,3 +71,25 @@ def is_invalid_audio_file(audio_path: str) -> bool:
             return len(audio_file) == 0
     except RuntimeError:
         return True
+
+
+def extract_audio(video_path: str, output_audio_path: str) -> None:
+    """
+    Extracts the audio track from a video file (e.g. MP4) and saves it as an MP3 file, overwriting any existing output.
+
+    Args:
+        video_path (str): Path to the input video file.
+        output_audio_path (str): Path to save the extracted MP3 audio file.
+
+    Raises:
+        ffmpeg.Error: If ffmpeg exits with an error; its captured stderr is logged before re-raising.
+    """
+    stream = ffmpeg.input(video_path).output(
+        output_audio_path, format="mp3", acodec="libmp3lame", audio_bitrate="192k"
+    )
+    try:
+        # Capture stderr: without it `e.stderr` is None and the log call below would itself crash.
+        stream.run(overwrite_output=True, capture_stderr=True)
+    except ffmpeg.Error as e:
+        hlog(f"Error extracting audio from video: {video_path}: {(e.stderr or b'').decode(errors='replace')}")
+        raise

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`entries: [`
`2`		`- {description: "mutox:language=Arabic,model=audiolm", priority: 1}`
	`2`	`+ {description: "mustard:model=audiolm", priority: 1}`
`3`	`3`	`]`