|
from io import BytesIO
from typing import List, Optional, Tuple
import os

import pandas as pd
import requests
from pydub import AudioSegment
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
    TEST_SPLIT,
    Scenario,
    Instance,
    Reference,
    CORRECT_TAG,
    Input,
    Output,
)
from helm.common.audio_utils import is_invalid_audio_file
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
from helm.common.hierarchical_logger import hlog, htrack_block
| 23 | + |
| 24 | + |
class MuToxScenario(Scenario):
    """
    MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector

    MuTox is the first highly multilingual audio-based dataset with toxicity labels. The dataset consists
    of 20k audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality
    of this dataset, the authors train the MuTox audio-based toxicity classifier, which allows zero-shot
    toxicity detection across a broad range of languages. This classifier outperforms existing text-based
    trainable classifiers by more than 1% AUC, while increasing the language coverage from 8 to 100+
    languages. When compared to a wordlist-based classifier that covers a similar number of languages,
    MuTox improves precision and recall by ~2.5 times.

    The supported languages and their 3-letter codes are listed in `LANGUAGE_CODES`.

    The columns of the dataset are:

        id: a string id of the segment;
        lang: 3-letter language code;
        partition: one of train, dev, or devtest;
        public_url_segment: the audio URL plus start and end offsets in milliseconds
            (upstream documentation describes it as "url:start:end", but the values observed
            here are whitespace-separated -- the parsing below splits on whitespace);
        audio_file_transcript: text transcription of the segment;
        contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings;
        label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
        etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
        detoxify_score: toxicity probabilities predicted by the Detoxify system (floats between 0 and 1);
        mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score:
            MuTox predictions as float numbers with any value (they can be interpreted as logits,
            i.e. probabilities before a sigmoid transformation).

    Citation:

        @misc{costajussà2023mutox,
            title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
            author={Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale,
                Prangthip Hansanti, Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
            year={2023},
            eprint={2401.05060},
            archivePrefix={arXiv},
            primaryClass={cs.CL}
        }
    """

    # Annotations TSV published by the MuTox authors.
    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"

    # Human-readable language name -> 3-letter language code used in the dataset's `lang` column.
    LANGUAGE_CODES = {
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin_Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western_Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",
    }
    # Backward-compatible alias for the original misspelled attribute name.
    LANGAUGE_CODES = LANGUAGE_CODES

    name = "mutox"
    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
    # NOTE: the original had a trailing space in the "toxicity" tag, which would break tag-based filtering.
    tags = ["audio", "classification", "toxicity"]

    @staticmethod
    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
        """
        Append `bad_audio_file` (one name per line) to the tracking file at `output_path`.

        Many of the links no longer exist or point to broken audio, so we keep track of them
        and skip them in future runs to significantly speed up gathering the instances.
        """
        with open(output_path, "a") as f:
            f.write(bad_audio_file + "\n")

    def __init__(self, language: str) -> None:
        """
        Args:
            language: Human-readable language name; must be a key of `LANGUAGE_CODES`.

        Raises:
            KeyError: If `language` is not one of the supported languages.
        """
        super().__init__()
        self._language_code: str = self.LANGUAGE_CODES[language]

    @staticmethod
    def _read_bad_audio_files(bad_audio_files_path: str) -> "set[str]":
        """Read the set of known-bad audio file names (one per line), if the tracking file exists."""
        bad_audio_files: set[str] = set()
        if os.path.exists(bad_audio_files_path):
            with open(bad_audio_files_path, "r") as f:
                for line in f:
                    bad_audio_files.add(line.strip())
        return bad_audio_files

    @staticmethod
    def _parse_url_segment(public_url_segment) -> Optional[Tuple[str, int, int]]:
        """
        Parse a `public_url_segment` value into (url, start_ms, end_ms).

        Returns None when the value is malformed: not a string (missing values come through
        pandas as NaN floats), the wrong number of whitespace-separated fields, or
        non-integer start/end offsets.
        """
        if not isinstance(public_url_segment, str):
            return None
        parts = public_url_segment.split()
        if len(parts) != 3:
            return None
        audio_url, start_ms_str, end_ms_str = parts
        try:
            # Guard the int() conversions so one malformed row cannot crash the whole run.
            return audio_url, int(start_ms_str), int(end_ms_str)
        except ValueError:
            return None

    @staticmethod
    def _download_and_clip_audio(audio_url: str, start_ms: int, end_ms: int, local_audio_path: str) -> None:
        """
        Download the full audio file at `audio_url`, clip it to [start_ms, end_ms] milliseconds,
        and save the clip as an MP3 at `local_audio_path`.

        Raises on any download or audio-processing failure (callers treat failures as bad files).
        """
        response = requests.get(audio_url, stream=True)
        response.raise_for_status()

        # Load the downloaded bytes, clip the segment, and export the clip.
        audio = AudioSegment.from_file(BytesIO(response.content))
        clipped_audio = audio[start_ms:end_ms]
        clipped_audio.export(local_audio_path, format="mp3")

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build test-split instances for the configured language from the "devtest" partition.

        Downloads the annotations TSV, then for each matching row downloads and clips the
        referenced audio. Clips are cached under `output_path`/audio, and permanently broken
        files are remembered in `output_path`/bad_audio_files.txt so later runs skip them.
        """
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "mutox.tsv")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Read bad audio files recorded by previous runs
        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
        bad_audio_files: set = self._read_bad_audio_files(bad_audio_files_path)
        hlog(f"Found {len(bad_audio_files)} bad audio files.")

        # Where the audio files will be downloaded to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        instances: List[Instance] = []
        df = pd.read_csv(annotations_path, delimiter="\t")
        hlog(f"Found {len(df)} rows in the dataset")

        valid_count: int = 0
        total_count: int = 0
        for row in tqdm(df.itertuples()):
            # Only process examples that are in devtest and in the language we're interested in
            if row.partition != "devtest" or row.lang != self._language_code:
                continue

            total_count += 1

            audio_filename: str = f"{row.id}.mp3"
            with htrack_block(f"Processing audio file: {audio_filename}"):
                # Discard known bad audio files
                if audio_filename in bad_audio_files:
                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
                    continue

                local_audio_path: str = os.path.join(audio_path, audio_filename)
                if not os.path.exists(local_audio_path):
                    # The provided URL has the complete audio, so we need to download it and clip it
                    parsed = self._parse_url_segment(row.public_url_segment)
                    if parsed is None:
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue
                    audio_url, start_ms, end_ms = parsed

                    try:
                        self._download_and_clip_audio(audio_url, start_ms, end_ms, local_audio_path)
                    except Exception as e:
                        # Some files are no longer available or invalid
                        hlog(
                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
                        )
                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                        continue

                if is_invalid_audio_file(local_audio_path):
                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                    continue

                valid_count += 1
                hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")

                # Following what was prompted to annotators as outlined in the paper
                instance_input = Input(
                    multimedia_content=MultimediaObject(
                        media_objects=[
                            MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
                            MediaObject(location=local_audio_path, content_type="audio/mpeg"),
                        ]
                    )
                )

                is_toxic: bool = row.label == 1
                references = [
                    Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
                    Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
                ]
                instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
        return instances