Skip to content

Commit a3cb6b1

Browse files
authored
MUStARDScenario for sarcasm detection (#3345)
1 parent 6552e67 commit a3cb6b1

File tree

8 files changed

+218
-15
lines changed

8 files changed

+218
-15
lines changed

setup.cfg

+3
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,9 @@ audiolm =
285285
# For clipping audio
286286
pydub~=0.25.1
287287

288+
# For extracting audio from videos
289+
ffmpeg-python~=0.2.0
290+
288291
# For HuggingFace audio datasets
289292
soundfile~=0.12
290293
librosa~=0.10

src/helm/benchmark/presentation/run_entries_audio.conf

+6-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ entries: [
22
####################################################################################################################
33
# Auditory Perception
44
####################################################################################################################
5-
{description: "meld_audio:model=audiolm", priority: 1}
65
{description: "vocal_sound:model=audiolm", priority: 1}
76
{description: "audiocaps:model=audiolm", priority: 1}
87
{description: "voxceleb2:model=audiolm", priority: 1}
@@ -11,6 +10,12 @@ entries: [
1110
{description: "air_bench_chat:subject=music,model=audiolm", priority: 1}
1211
{description: "air_bench_chat:subject=mix,model=audiolm", priority: 1}
1312

13+
####################################################################################################################
14+
# Emotion detection
15+
####################################################################################################################
16+
{description: "meld_audio:model=audiolm", priority: 1}
17+
{description: "mustard:model=audiolm", priority: 1}
18+
1419
####################################################################################################################
1520
# Robustness
1621
####################################################################################################################
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
entries: [
2-
{description: "mutox:language=Arabic,model=audiolm", priority: 1}
2+
{description: "mustard:model=audiolm", priority: 1}
33
]

src/helm/benchmark/run_specs/audio_run_specs.py

+17
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,23 @@ def get_mutox_audio_run_spec(language: str) -> RunSpec:
183183
)
184184

185185

186+
@run_spec_function("mustard")
def get_mustard_audio_run_spec() -> RunSpec:
    """
    Builds the run spec for the MUStARD sarcasm detection scenario.

    The scenario is adapted with the joint multiple-choice adapter using no
    in-context training instances, and is scored with the exact match metrics.
    The run spec name doubles as its only group.
    """
    group_name: str = "mustard"
    return RunSpec(
        name=group_name,
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.audio_language.mustard_scenario.MUStARDScenario"
        ),
        adapter_spec=_get_multiple_choice_joint_adapter_spec(
            input_noun=None, output_noun="Answer", max_train_instances=0
        ),
        metric_specs=get_exact_match_metric_specs(),
        groups=[group_name],
    )
201+
202+
186203
@run_spec_function("covost2")
187204
def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
188205
scenario_spec = ScenarioSpec(

src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
)
1616
from helm.common.media_object import MediaObject, MultimediaObject
1717
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
18-
from helm.common.audio_utils import use_ffmpeg_to_extract_audio_from_video
18+
from helm.common.audio_utils import extract_audio
1919

2020

2121
class CasualConversations2Scenario(Scenario):
@@ -123,7 +123,9 @@ def get_instances(self, output_path: str) -> List[Instance]:
123123
if file_name.endswith(".mp4"):
124124
local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
125125
local_video_path: str = os.path.join(data_dir, file_name)
126-
use_ffmpeg_to_extract_audio_from_video(local_video_path, local_audio_path)
126+
127+
if not os.path.exists(local_audio_path):
128+
extract_audio(local_video_path, local_audio_path)
127129
assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
128130

129131
subject_answer = audio_scripts[file_name][self._subject]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import json
2+
import os
3+
from typing import List
4+
5+
from tqdm import tqdm
6+
7+
from helm.benchmark.scenarios.scenario import (
8+
TEST_SPLIT,
9+
Scenario,
10+
Instance,
11+
Reference,
12+
CORRECT_TAG,
13+
Input,
14+
Output,
15+
)
16+
from helm.common.audio_utils import is_invalid_audio_file, extract_audio
17+
from helm.common.media_object import MediaObject, MultimediaObject
18+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
19+
20+
21+
class MUStARDScenario(Scenario):
    # Raw docstring: the BibTeX entry below contains backslashes (`\_`, `\'`) that must be kept verbatim
    # and would otherwise be treated as (invalid) string escape sequences.
    r"""
    MUStARD: Multimodal Sarcasm Detection Dataset

    A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
    TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
    of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
    additional information on the scenario where it occurs.

    We just extract the audio from the given videos.

    The columns of the dataset are:
    - utterance: The text of the target utterance to classify.
    - speaker: Speaker of the target utterance.
    - context: List of utterances (in chronological order) preceding the target utterance.
    - context_speakers: Respective speakers of the context utterances.
    - sarcasm: Binary label for sarcasm tag.

    More specifically an example looks like this:

    "1_60": {
        "utterance": "It's just a privilege to watch your mind at work.",
        "speaker": "SHELDON",
        "context": [
            "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
            "My apologies. What's your plan?"
        ],
        "context_speakers": [
            "LEONARD",
            "SHELDON"
        ],
        "show": "BBT",
        "sarcasm": true
    }

    The key is the video id.

    The video folder has two subfolders:
    - context_final: Contains the context videos (e.g., 1_60_c.mp4)
    - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)

    Citation:

    @inproceedings{mustard,
        title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
        author = "Castro, Santiago  and
                  Hazarika, Devamanyu  and
                  P{\'e}rez-Rosas, Ver{\'o}nica  and
                  Zimmermann, Roger  and
                  Mihalcea, Rada  and
                  Poria, Soujanya",
        booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
                    (Volume 1: Long Papers)",
        month = "7",
        year = "2019",
        address = "Florence, Italy",
        publisher = "Association for Computational Linguistics",
    }
    """

    RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
    ANNOTATIONS_URL: str = (
        "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/" "sarcasm_data.json"
    )

    name = "mustard"
    # The paper was published in 2019 (see the citation above), so the link text says 2019.
    description = "Sarcasm detection benchmark ([Castro et al, 2019](https://arxiv.org/abs/1906.01815))."
    tags = ["audio", "classification", "toxicity", "sarcasm detection"]

    @staticmethod
    def _ensure_audio_extracted(video_file_path: str, audio_file_path: str) -> str:
        """
        Extracts the audio of `video_file_path` into `audio_file_path` unless that file already exists,
        checks that the resulting audio file is valid, and returns `audio_file_path`.
        """
        # Skip the extraction when a previous run already produced the audio file
        if not os.path.exists(audio_file_path):
            extract_audio(video_file_path, audio_file_path)
        assert not is_invalid_audio_file(audio_file_path), f"Invalid audio file: {audio_file_path}"
        return audio_file_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Downloads the annotations and raw video clips into `output_path`, extracts the audio of every
        context and utterance clip, and returns one test instance per annotated utterance.

        Each instance's input holds the context audio, the utterance audio and the question text; its
        references are "Yes" and "No", with the correct one chosen by the `sarcasm` label.
        """
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Where the video files will be downloaded to
        video_path: str = os.path.join(output_path, "video")
        ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)

        # Where the audio files will be extracted to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        # Use a context manager so the file handle is closed, and read as UTF-8 regardless of the platform default
        with open(annotations_path, "r", encoding="utf-8") as annotations_file:
            annotations = json.load(annotations_file)

        instances: List[Instance] = []
        for key, row in tqdm(annotations.items()):
            # Extract the audio from the context video
            context_audio_path: str = self._ensure_audio_extracted(
                video_file_path=os.path.join(video_path, "context_final", f"{key}_c.mp4"),
                audio_file_path=os.path.join(audio_path, f"{key}_c.mp3"),
            )

            # Extract the audio from the target utterance video
            utterance_audio_path: str = self._ensure_audio_extracted(
                video_file_path=os.path.join(video_path, "utterances_final", f"{key}.mp4"),
                audio_file_path=os.path.join(audio_path, f"{key}.mp3"),
            )

            # Named `instance_input` rather than `input` to avoid shadowing the builtin
            instance_input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        # Input both the context and the utterance audio
                        MediaObject(text="Context:", content_type="text/plain"),
                        MediaObject(location=context_audio_path, content_type="audio/mpeg"),
                        MediaObject(text="Utterance:", content_type="text/plain"),
                        MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
                        MediaObject(
                            text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
                        ),
                    ]
                )
            )
            is_sarcastic: bool = row["sarcasm"]
            references = [
                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
            ]
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        return instances

src/helm/benchmark/static/schema_audio.yaml

+21-2
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,6 @@ run_groups:
211211
description: Audio Scenarios
212212
category: All scenarios
213213
subgroups:
214-
- audio_mnist
215214
- covost2
216215
- vocal_sound
217216
- fleurs
@@ -221,6 +220,7 @@ run_groups:
221220
- meld_audio
222221
- air_bench_chat
223222
- mutox
223+
- mustard
224224

225225
- name: audio_mnist
226226
display_name: AudioMNIST
@@ -523,4 +523,23 @@ run_groups:
523523
what: samples of utterances
524524
who: real speakers
525525
when: "2024"
526-
language: 30 langguages
526+
language: 30 languages
527+
528+
- name: mustard
529+
display_name: MUStARD
530+
description: >
531+
A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
532+
TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous.
533+
([Castro et al, 2019](https://arxiv.org/abs/1906.01815)).
534+
metric_groups:
535+
- accuracy
536+
- general_information
537+
environment:
538+
main_name: exact_match
539+
main_split: test
540+
taxonomy:
541+
task: sarcasm detection
542+
what: clips from television shows
543+
who: real speakers
544+
when: "2019"
545+
language: English

src/helm/common/audio_utils.py

+24-9
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
from typing import Optional
44
from filelock import FileLock
55

6+
import ffmpeg
67
import numpy as np
78
import soundfile as sf
89
import subprocess
910

11+
from helm.common.hierarchical_logger import hlog
1012
from helm.common.multimodal_request_utils import get_contents_as_bytes
1113
from helm.common.optional_dependencies import handle_module_not_found_error
1214

@@ -55,15 +57,6 @@ def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
5557
raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
5658

5759

58-
def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
59-
if os.path.exists(output_audio_path):
60-
return
61-
try:
62-
subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
63-
except (subprocess.CalledProcessError, FileNotFoundError):
64-
raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")
65-
66-
6760
def is_invalid_audio_file(audio_path: str) -> bool:
6861
"""
6962
Two conditions for an audio file to be considered invalid:
@@ -78,3 +71,25 @@ def is_invalid_audio_file(audio_path: str) -> bool:
7871
return len(audio_file) == 0
7972
except RuntimeError:
8073
return True
74+
75+
76+
def extract_audio(video_path: str, output_audio_path: str) -> None:
    """
    Extracts audio from an MP4 video file and saves it as an MP3 file.

    The audio is encoded with the `libmp3lame` codec at a bitrate of 192k. An existing file at
    `output_audio_path` is overwritten.

    Args:
        video_path (str): Path to the input MP4 video file.
        output_audio_path (str): Path to save the extracted MP3 audio file.

    Returns:
        None

    Raises:
        ffmpeg.Error: If the ffmpeg invocation fails. The error output is logged before re-raising.
    """
    try:
        (
            ffmpeg.input(video_path)
            .output(output_audio_path, format="mp3", acodec="libmp3lame", audio_bitrate="192k")
            # stderr must be captured explicitly; otherwise `ffmpeg.Error.stderr` is None
            # and there is nothing to report in the handler below.
            .run(overwrite_output=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # Guard against a missing stderr so that logging can never mask the original ffmpeg error
        stderr_text: str = e.stderr.decode(errors="replace") if e.stderr else "<no stderr captured>"
        hlog(f"Error extracting audio from video: {video_path}: {stderr_text}")
        raise

0 commit comments

Comments
 (0)