|
| 1 | +import json |
| 2 | +import os |
| 3 | +from typing import List |
| 4 | + |
| 5 | +from tqdm import tqdm |
| 6 | + |
| 7 | +from helm.benchmark.scenarios.scenario import ( |
| 8 | + TEST_SPLIT, |
| 9 | + Scenario, |
| 10 | + Instance, |
| 11 | + Reference, |
| 12 | + CORRECT_TAG, |
| 13 | + Input, |
| 14 | + Output, |
| 15 | +) |
| 16 | +from helm.common.audio_utils import is_invalid_audio_file, extract_audio |
| 17 | +from helm.common.media_object import MediaObject, MultimediaObject |
| 18 | +from helm.common.general import ensure_directory_exists, ensure_file_downloaded |
| 19 | + |
| 20 | + |
class MUStARDScenario(Scenario):
    # NOTE: raw docstring (r""") because the BibTeX entry below contains
    # backslash escapes like \_ that are invalid escape sequences in a
    # normal string literal (SyntaxWarning on Python >= 3.12).
    r"""
    MUStARD: Multimodal Sarcasm Detection Dataset

    A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
    TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
    of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
    additional information on the scenario where it occurs.

    We just extract the audio from the given videos.

    The columns of the dataset are:
    - utterance: The text of the target utterance to classify.
    - speaker: Speaker of the target utterance.
    - context: List of utterances (in chronological order) preceding the target utterance.
    - context_speakers: Respective speakers of the context utterances.
    - sarcasm: Binary label for sarcasm tag.

    More specifically an example looks like this:

    "1_60": {
        "utterance": "It's just a privilege to watch your mind at work.",
        "speaker": "SHELDON",
        "context": [
            "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
            "My apologies. What's your plan?"
        ],
        "context_speakers": [
            "LEONARD",
            "SHELDON"
        ],
        "show": "BBT",
        "sarcasm": true
    }

    The key is the video id.

    The video folder has two subfolders:
    - context_final: Contains the context videos (e.g., 1_60_c.mp4)
    - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)

    Citation:

    @inproceedings{mustard,
        title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
        author = "Castro, Santiago and
            Hazarika, Devamanyu and
            P{\'e}rez-Rosas, Ver{\'o}nica and
            Zimmermann, Roger and
            Mihalcea, Rada and
            Poria, Soujanya",
        booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
            (Volume 1: Long Papers)",
        month = "7",
        year = "2019",
        address = "Florence, Italy",
        publisher = "Association for Computational Linguistics",
    }
    """

    RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
    ANNOTATIONS_URL: str = (
        "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/" "sarcasm_data.json"
    )

    name = "mustard"
    description = "Sarcasm detection benchmark ([Castro et al, 2018](https://arxiv.org/abs/1906.01815))."
    tags = ["audio", "classification", "toxicity", "sarcasm detection"]

    @staticmethod
    def _ensure_audio(video_path: str, audio_path: str) -> None:
        """Extract `video_path`'s audio track to `audio_path` if not already present, then validate it.

        Extraction is skipped when the target file exists so repeated runs reuse prior work.
        """
        if not os.path.exists(audio_path):
            extract_audio(video_path, audio_path)
        assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the MUStARD annotations and videos, extract audio, and build test instances.

        Each instance pairs the context audio and the target-utterance audio with a yes/no
        sarcasm question; the reference tagged CORRECT_TAG follows the `sarcasm` label.
        """
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Where the video files will be downloaded to
        video_path: str = os.path.join(output_path, "video")
        ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)

        # Where the audio files will be extracted to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        # Fix: read the annotations with a context manager so the file handle is closed
        # (the original `json.load(open(...))` leaked it).
        with open(annotations_path, "r") as annotations_file:
            annotations = json.load(annotations_file)

        instances: List[Instance] = []
        for key, row in tqdm(annotations.items()):
            # Extract the audio from the context video (e.g., 1_60_c.mp4 -> 1_60_c.mp3)
            context_audio_path: str = os.path.join(audio_path, f"{key}_c.mp3")
            self._ensure_audio(os.path.join(video_path, "context_final", f"{key}_c.mp4"), context_audio_path)

            # Extract the audio from the target utterance video (e.g., 1_60.mp4 -> 1_60.mp3)
            utterance_audio_path: str = os.path.join(audio_path, f"{key}.mp3")
            self._ensure_audio(os.path.join(video_path, "utterances_final", f"{key}.mp4"), utterance_audio_path)

            # `instance_input` (not `input`) to avoid shadowing the builtin.
            instance_input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        # Input both the context and the utterance audio
                        MediaObject(text="Context:", content_type="text/plain"),
                        MediaObject(location=context_audio_path, content_type="audio/mpeg"),
                        MediaObject(text="Utterance:", content_type="text/plain"),
                        MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
                        MediaObject(
                            text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
                        ),
                    ]
                )
            )
            is_sarcastic: bool = row["sarcasm"]
            references = [
                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
            ]
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        return instances
0 commit comments