Skip to content

Commit 6552e67

Browse files
teetoneImKeTT
andauthored
MuTox Scenario (#3343)
Co-authored-by: Haoqin Tu <tuisaac163@gmail.com>
1 parent 432dcd3 commit 6552e67

File tree

8 files changed

+335
-314
lines changed

8 files changed

+335
-314
lines changed

setup.cfg

+3
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,9 @@ audiolm =
282282
crfm-helm[openai]
283283
crfm-helm[google]
284284

285+
# For clipping audio
286+
pydub~=0.25.1
287+
285288
# For HuggingFace audio datasets
286289
soundfile~=0.12
287290
librosa~=0.10

src/helm/benchmark/presentation/run_entries_audio.conf

+35-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ entries: [
22
####################################################################################################################
33
# Auditory Perception
44
####################################################################################################################
5-
{description: "audio_mnist:model=audiolm", priority: 1}
65
{description: "meld_audio:model=audiolm", priority: 1}
76
{description: "vocal_sound:model=audiolm", priority: 1}
87
{description: "audiocaps:model=audiolm", priority: 1}
@@ -51,4 +50,39 @@ entries: [
5150
{description: "common_voice_15:language=German,model=audiolm", priority: 1}
5251
{description: "common_voice_15:language=French,model=audiolm", priority: 1}
5352

53+
####################################################################################################################
54+
# Toxicity
55+
####################################################################################################################
56+
57+
{description: "mutox:language=Arabic,model=audiolm", priority: 1}
58+
{description: "mutox:language=Bengali,model=audiolm", priority: 1}
59+
{description: "mutox:language=Bulgarian,model=audiolm", priority: 1}
60+
{description: "mutox:language=Catalan,model=audiolm", priority: 1}
61+
{description: "mutox:language=Czech,model=audiolm", priority: 1}
62+
{description: "mutox:language=Mandarin_Chinese,model=audiolm", priority: 1}
63+
{description: "mutox:language=Danish,model=audiolm", priority: 1}
64+
{description: "mutox:language=German,model=audiolm", priority: 1}
65+
{description: "mutox:language=Greek,model=audiolm", priority: 1}
66+
{description: "mutox:language=English,model=audiolm", priority: 1}
67+
{description: "mutox:language=Estonian,model=audiolm", priority: 1}
68+
# {description: "mutox:language=Western_Persian,model=audiolm", priority: 1}
69+
{description: "mutox:language=Finnish,model=audiolm", priority: 1}
70+
{description: "mutox:language=French,model=audiolm", priority: 1}
71+
{description: "mutox:language=Hebrew,model=audiolm", priority: 1}
72+
{description: "mutox:language=Hindi,model=audiolm", priority: 1}
73+
{description: "mutox:language=Hungarian,model=audiolm", priority: 1}
74+
{description: "mutox:language=Indonesian,model=audiolm", priority: 1}
75+
{description: "mutox:language=Italian,model=audiolm", priority: 1}
76+
{description: "mutox:language=Dutch,model=audiolm", priority: 1}
77+
{description: "mutox:language=Polish,model=audiolm", priority: 1}
78+
{description: "mutox:language=Portuguese,model=audiolm", priority: 1}
79+
{description: "mutox:language=Russian,model=audiolm", priority: 1}
80+
{description: "mutox:language=Spanish,model=audiolm", priority: 1}
81+
{description: "mutox:language=Slovak,model=audiolm", priority: 1}
82+
{description: "mutox:language=Swahili,model=audiolm", priority: 1}
83+
{description: "mutox:language=Tagalog,model=audiolm", priority: 1}
84+
{description: "mutox:language=Turkish,model=audiolm", priority: 1}
85+
{description: "mutox:language=Urdu,model=audiolm", priority: 1}
86+
{description: "mutox:language=Vietnamese,model=audiolm", priority: 1}
87+
]
5488
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
entries: [
2+
{description: "mutox:language=Arabic,model=audiolm", priority: 1}
3+
]

src/helm/benchmark/presentation/run_entries_audio_v0.conf

-7
This file was deleted.

src/helm/benchmark/run_specs/audio_run_specs.py

+20
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,26 @@ def get_meld_audio_run_spec() -> RunSpec:
163163
)
164164

165165

166+
@run_spec_function("mutox")
def get_mutox_audio_run_spec(language: str) -> RunSpec:
    """Build the run spec for the MuTox audio toxicity scenario in one language.

    `language` is forwarded unchanged to `MuToxScenario` and is also embedded in
    the run spec name. The adapter is the joint multiple-choice adapter with no
    in-context training instances, and the metrics are whatever
    `get_exact_match_metric_specs()` returns.
    """
    group_name: str = "mutox"
    return RunSpec(
        name=f"{group_name}:language={language}",
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.audio_language.mutox_scenario.MuToxScenario",
            args={"language": language},
        ),
        adapter_spec=_get_multiple_choice_joint_adapter_spec(
            input_noun=None, output_noun="Answer", max_train_instances=0
        ),
        metric_specs=get_exact_match_metric_specs(),
        groups=[group_name],
    )
184+
185+
166186
@run_spec_function("covost2")
167187
def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
168188
scenario_spec = ScenarioSpec(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
from io import BytesIO
2+
from typing import List
3+
import os
4+
import requests
5+
6+
from pydub import AudioSegment
7+
from tqdm import tqdm
8+
import pandas as pd
9+
10+
from helm.benchmark.scenarios.scenario import (
11+
TEST_SPLIT,
12+
Scenario,
13+
Instance,
14+
Reference,
15+
CORRECT_TAG,
16+
Input,
17+
Output,
18+
)
19+
from helm.common.audio_utils import is_invalid_audio_file
20+
from helm.common.media_object import MediaObject, MultimediaObject
21+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
22+
from helm.common.hierarchical_logger import hlog, htrack_block
23+
24+
25+
class MuToxScenario(Scenario):
    """
    MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector

    MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
    audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
    dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
    a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
    1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
    classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.

    Languages (the key is the value accepted by the `language` argument, the value is the dataset's code):
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin_Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western_Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",

    The columns of the dataset are:

        id: a string id of the segment;
        lang: 3-letter language code;
        partition: one of train, dev, or devtest
        public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
            NOTE(review): `get_instances` splits this field on whitespace and expects exactly three parts
            (url, start, end) -- confirm the separator against the actual TSV.
        audio_file_transcript: text transcription of the segment;
        contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
        label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
        etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
        detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
        mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
            as float numbers with any value (they can be interpreted as logits,
            i.e. probabilities before a sigmoid transformation).

    Citation:

    @misc{costajussà2023mutox,
        title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
        author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
        Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
        year={2023},
        eprint={2401.05060},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
    }
    """

    # TSV file holding the annotations for every language and partition.
    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"

    # (connect, read) timeouts in seconds for fetching a source audio file. Without a timeout a
    # single unresponsive host would block instance gathering forever.
    AUDIO_DOWNLOAD_TIMEOUT_SECONDS = (10, 60)

    # Maps the `language` argument to the 3-letter code stored in the `lang` column.
    LANGUAGE_CODES = {
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin_Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western_Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",
    }

    # Original (misspelled) attribute name, kept as an alias so existing references keep working.
    LANGAUGE_CODES = LANGUAGE_CODES

    name = "mutox"
    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
    tags = ["audio", "classification", "toxicity"]

    @staticmethod
    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
        """
        Append `bad_audio_file` as a new line to the text file at `output_path`.

        Many of the links do not exist or point to broken files, so we keep track of them
        and skip them in future runs to significantly speed up gathering the instances.
        """
        with open(output_path, "a") as f:
            f.write(bad_audio_file + "\n")

    def __init__(self, language: str) -> None:
        """
        Select the language whose `devtest` examples will be turned into instances.

        Raises `KeyError` if `language` is not a key of `LANGUAGE_CODES`.
        """
        super().__init__()
        if language not in self.LANGUAGE_CODES:
            raise KeyError(f"Unsupported language: {language}. Supported languages: {sorted(self.LANGUAGE_CODES)}")
        self._language_code: str = self.LANGUAGE_CODES[language]

    def _download_and_clip_audio(self, audio_url: str, start_ms: int, end_ms: int, local_audio_path: str) -> None:
        """
        Download the complete audio at `audio_url`, keep only the `[start_ms, end_ms)` slice
        and export that slice as an MP3 to `local_audio_path`.

        Any download or decoding error is propagated to the caller.
        """
        # The body is read in full via `response.content`, so streaming would buy nothing.
        response = requests.get(audio_url, timeout=self.AUDIO_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()

        # Load audio from the downloaded bytes, clip it and save the clipped file
        audio = AudioSegment.from_file(BytesIO(response.content))
        clipped_audio = audio[start_ms:end_ms]
        clipped_audio.export(local_audio_path, format="mp3")

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one test instance per usable `devtest` example of the selected language.

        The annotations TSV and the clipped audio files are stored under `output_path`. Examples
        whose audio cannot be obtained or is invalid are skipped; audio files that failed to
        download / decode or turned out invalid are recorded in `bad_audio_files.txt` so later
        runs skip them immediately. Each instance asks whether the utterance contains toxicity
        and carries "Yes" / "No" references, with the correct one chosen from the `label` column.

        Raises `AssertionError` if no instance could be built.
        """
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "mutox.tsv")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Read bad audio files recorded by previous runs
        bad_audio_files: set[str] = set()
        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
        if os.path.exists(bad_audio_files_path):
            # Each line is the audio file name
            with open(bad_audio_files_path, "r") as f:
                bad_audio_files.update(line.strip() for line in f)
            hlog(f"Found {len(bad_audio_files)} bad audio files.")

        # Where the audio files will be downloaded to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        instances: List[Instance] = []
        df = pd.read_csv(annotations_path, delimiter="\t")
        hlog(f"Found {len(df)} rows in the dataset")

        # Only process examples that are in devtest and the language we're interested in.
        # Filtering up front avoids iterating over every row of every language in Python.
        selected = df[(df["partition"] == "devtest") & (df["lang"] == self._language_code)]

        valid_count: int = 0
        total_count: int = 0
        for row in tqdm(selected.itertuples(), total=len(selected)):
            total_count += 1

            audio_filename: str = f"{row.id}.mp3"
            with htrack_block(f"Processing audio file: {audio_filename}"):
                # Discard known bad audio files
                if audio_filename in bad_audio_files:
                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
                    continue

                local_audio_path: str = os.path.join(audio_path, audio_filename)
                if not os.path.exists(local_audio_path):
                    # The provided URL has the complete audio, so we need to download it and clip it
                    if not isinstance(row.public_url_segment, str):
                        # Sometimes URL is just a float causing an error. Skip those.
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    # Expect exactly three whitespace-separated fields: url, start and end
                    parts = row.public_url_segment.split()
                    if len(parts) != 3:
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    audio_url, start_ms_str, end_ms_str = parts
                    try:
                        start_ms, end_ms = int(start_ms_str), int(end_ms_str)
                    except ValueError:
                        # Non-numeric offsets: treat like any other malformed segment instead of crashing
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    try:
                        self._download_and_clip_audio(audio_url, start_ms, end_ms, local_audio_path)
                    except Exception as e:
                        # Some files are no longer available or invalid
                        hlog(
                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
                        )
                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                        continue

                if is_invalid_audio_file(local_audio_path):
                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                    continue

                valid_count += 1
                hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")

                instance_input = Input(
                    multimedia_content=MultimediaObject(
                        media_objects=[
                            # Following what was prompted annotators as outlined in the paper
                            MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
                            MediaObject(location=local_audio_path, content_type="audio/mpeg"),
                        ]
                    )
                )

                is_toxic: bool = row.label == 1
                references = [
                    Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
                    Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
                ]
                instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
        return instances

src/helm/benchmark/static/schema_audio.yaml

+20-1
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ run_groups:
220220
- speech_robust_bench
221221
- meld_audio
222222
- air_bench_chat
223+
- mutox
223224

224225
- name: audio_mnist
225226
display_name: AudioMNIST
@@ -504,4 +505,22 @@ run_groups:
504505
what: audio, question, and answer of audio samples
505506
who: real speakers
506507
when: "2024"
507-
language: English
508+
language: English
509+
510+
- name: mutox
511+
display_name: MuTox
512+
description: >
513+
The dataset consists of 20k audio utterances for English and Spanish, and 4k for the other languages.
514+
([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060)).
515+
metric_groups:
516+
- accuracy
517+
- general_information
518+
environment:
519+
main_name: exact_match
520+
main_split: test
521+
taxonomy:
522+
task: toxicity detection
523+
what: samples of utterances
524+
who: real speakers
525+
when: "2024"
526+
language: 30 languages

0 commit comments

Comments
 (0)