Skip to content

Commit 0c9803e

Browse files
authored
VoiceJailbreakAttacksScenario (#3361)
1 parent e16e127 commit 0c9803e

File tree

7 files changed

+173
-9
lines changed

7 files changed

+173
-9
lines changed

setup.cfg

+2-2
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ mistral =
146146
mistralai~=1.1
147147

148148
openai =
149-
openai~=1.61
149+
openai~=1.64
150150
tiktoken~=0.7
151151
pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0
152152

@@ -282,7 +282,7 @@ audiolm =
282282
crfm-helm[openai]
283283
crfm-helm[google]
284284

285-
# For clipping audio
285+
# For clipping and converting audio
286286
pydub~=0.25.1
287287

288288
# For extracting audio from videos

src/helm/benchmark/presentation/run_entries_audio.conf

+7-5
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ entries: [
66
{description: "audiocaps:model=audiolm", priority: 1}
77
{description: "voxceleb2:model=audiolm", priority: 1}
88
{description: "librispeech:model=audiolm", priority: 1}
9+
{description: "air_bench_chat:subject=speech,model=audiolm", priority: 1}
910

1011
####################################################################################################################
1112
# Fairness
@@ -22,7 +23,6 @@ entries: [
2223
# Knowledge
2324
####################################################################################################################
2425

25-
{description: "air_bench_chat:subject=speech,model=audiolm", priority: 1}
2626
{description: "air_bench_chat:subject=sound,model=audiolm", priority: 1}
2727
{description: "air_bench_chat:subject=music,model=audiolm", priority: 1}
2828
{description: "air_bench_chat:subject=mix,model=audiolm", priority: 1}
@@ -59,9 +59,10 @@ entries: [
5959
{description: "fleurs:language=Thai,model=audiolm", priority: 1}
6060
{description: "fleurs:language=Mandarin_Chinese,model=audiolm", priority: 1}
6161

62-
{description: "common_voice_15:language=English,model=audiolm", priority: 1}
63-
{description: "common_voice_15:language=German,model=audiolm", priority: 1}
64-
{description: "common_voice_15:language=French,model=audiolm", priority: 1}
62+
# TODO: remove?
63+
# {description: "common_voice_15:language=English,model=audiolm", priority: 1}
64+
# {description: "common_voice_15:language=German,model=audiolm", priority: 1}
65+
# {description: "common_voice_15:language=French,model=audiolm", priority: 1}
6566

6667
{description: "multilingual_librispeech:language=Dutch,model=audiolm", priority: 1}
6768
{description: "multilingual_librispeech:language=Spanish,model=audiolm", priority: 1}
@@ -110,5 +111,6 @@ entries: [
110111
# Safety
111112
####################################################################################################################
112113

113-
114+
{description: "voice_jailbreak_attacks:subset=baseline,model=audiolm", priority: 1}
115+
{description: "voice_jailbreak_attacks:subset=textjailbreak,model=audiolm", priority: 1}
114116
]
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
entries: [
2-
{description: "mustard:model=audiolm", priority: 1}
2+
{description: "voice_jailbreak_attacks:subset=baseline,model=audiolm", priority: 1}
3+
{description: "voice_jailbreak_attacks:subset=textjailbreak,model=audiolm", priority: 1}
34
]

src/helm/benchmark/run_specs/audio_run_specs.py

+23
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from helm.benchmark.metrics.common_metric_specs import (
1212
get_classification_metric_specs,
1313
get_exact_match_metric_specs,
14+
get_generative_harms_metric_specs,
1415
get_basic_metric_specs,
1516
)
1617
from helm.benchmark.metrics.metric import MetricSpec
@@ -200,6 +201,28 @@ def get_mustard_audio_run_spec() -> RunSpec:
200201
)
201202

202203

204+
@run_spec_function("voice_jailbreak_attacks")
def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
    """Build the run spec for the voice jailbreak attacks scenario.

    Args:
        subset: Subset name; passed through unchanged as the scenario's ``subset``
            argument and embedded in the run spec name.

    Returns:
        A RunSpec named ``voice_jailbreak_attacks:subset=<subset>`` that uses the
        generation adapter (up to 1024 output tokens) and the generative harms
        metric specs with basic metrics included.
    """
    group: str = "voice_jailbreak_attacks"

    # Fully qualified path of the scenario class to instantiate.
    scenario_class_name: str = (
        "helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
        "VoiceJailbreakAttacksScenario"
    )
    scenario = ScenarioSpec(class_name=scenario_class_name, args={"subset": subset})

    adapter = _get_generation_adapter_spec(max_tokens=1024)

    harms_metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
        include_basic_metrics=True,
        include_generative_harms_metrics=True,
    )

    return RunSpec(
        name=f"{group}:subset={subset}",
        scenario_spec=scenario,
        adapter_spec=adapter,
        metric_specs=harms_metric_specs,
        groups=[group],
    )
224+
225+
203226
@run_spec_function("covost2")
204227
def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
205228
scenario_spec = ScenarioSpec(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import os
2+
from typing import List
3+
4+
from helm.benchmark.scenarios.scenario import (
5+
TEST_SPLIT,
6+
Scenario,
7+
Instance,
8+
Input,
9+
)
10+
from helm.common.audio_utils import is_invalid_audio_file
11+
from helm.common.media_object import MediaObject, MultimediaObject
12+
13+
14+
class VoiceJailbreakAttacksScenario(Scenario):
    """
    Voice Jailbreak Attacks Against GPT-4o

    Paper: https://arxiv.org/abs/2405.19103

    The examples used in this scenario were generated following the instructions in the GitHub repository:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack
    Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".

    We ran:
    python tts/prompt2audio.py --dataset baseline --voice fable
    python tts/prompt2audio.py --dataset textjailbreak --voice fable

    then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:

    voice_jailbreak_attacks/
        baseline_fable/
            *.wav
        textjailbreak_fable/
            *.wav

    Base prompts (30 total):
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
    Jailbreak templates:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv

    The authors used Attack Success Rate (ASR) as the evaluation metric.
    Specifically, two authors manually label each response to determine if it answers the forbidden
    question. If there is any uncertainty, the authors discuss to reach a conclusion.

    Citation:

    @article{SWBZ24,
      author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
      title = {{Voice Jailbreak Attacks Against GPT-4o}},
      journal = {{CoRR abs/2405.19103}},
      year = {2024}
    }
    """

    # Subsets for which a "<subset>_fable" audio folder is expected under the output path.
    VALID_SUBSETS = ["baseline", "textjailbreak"]

    name = "voice_jailbreak_attacks"
    description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
    tags = ["audio", "safety"]

    def __init__(self, subset: str = "baseline") -> None:
        """
        Args:
            subset: Which set of audio prompts to load; must be one of VALID_SUBSETS.

        Raises:
            AssertionError: If `subset` is not in VALID_SUBSETS.
        """
        super().__init__()
        assert subset in self.VALID_SUBSETS, f"Invalid subset: {subset}"
        self._subset = subset

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one test-split instance per `.wav` file in `<output_path>/<subset>_fable`.

        Each instance carries a single audio media object and no references. The audio
        is not downloaded here: the folder must already have been placed manually.

        Args:
            output_path: Scenario output directory that contains the generated audio folders.

        Returns:
            The instances, ordered by file name so runs are reproducible.

        Raises:
            AssertionError: If the audio directory is missing or a `.wav` file is
                reported invalid by `is_invalid_audio_file`.
        """
        audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
        assert os.path.exists(audio_directory_path), f"Audio directory does not exist: {audio_directory_path}"

        instances: List[Instance] = []
        # os.listdir returns entries in arbitrary order; sort so that the instance
        # order does not depend on the filesystem.
        for file_name in sorted(os.listdir(audio_directory_path)):
            # Match on the full extension so that names merely ending in "wav" are skipped.
            if not file_name.endswith(".wav"):
                continue

            audio_path: str = os.path.join(audio_directory_path, file_name)
            assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"

            audio_input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        MediaObject(location=audio_path, content_type="audio/wav"),
                    ]
                )
            )
            instances.append(Instance(input=audio_input, references=[], split=TEST_SPLIT))

        return instances

src/helm/benchmark/static/schema_audio.yaml

+35
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,23 @@ metrics:
172172
description: Chinese character error rate (CER) for evaluating the accuracy of Chinese ASR.
173173
lower_is_better: true
174174

175+
# Toxicity metrics
176+
- name: expected_max_toxicity
177+
display_name: Average max toxicity
178+
short_display_name: Avg. max toxicity
179+
lower_is_better: true
180+
description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
181+
182+
- name: max_toxicity_probability
183+
display_name: max toxicity
184+
lower_is_better: true
185+
description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
186+
187+
- name: toxic_frac
188+
display_name: Toxic fraction
189+
lower_is_better: true
190+
description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
191+
175192
############################################################
176193
perturbations: []
177194

@@ -232,6 +249,7 @@ run_groups:
232249
# Toxicity Detection
233250
- mutox
234251
# Safety
252+
- voice_jailbreak_attacks
235253

236254
- name: audio_mnist
237255
display_name: AudioMNIST
@@ -555,6 +573,23 @@ run_groups:
555573
when: "2019"
556574
language: English
557575

576+
- name: voice_jailbreak_attacks
577+
display_name: Voice Jailbreak Attacks Against GPT-4o
578+
description: >
579+
Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103)).
580+
metric_groups:
581+
- accuracy
582+
- general_information
583+
environment:
584+
main_name: toxic_frac
585+
main_split: test
586+
taxonomy:
587+
task: refusal for safety
588+
what: voice jailbreak attacks against GPT-4o
589+
who: AI-generated speech
590+
when: "2024"
591+
language: English
592+
558593
- name: ami
559594
display_name: AMI Meeting Corpus
560595
description: >

src/helm/common/audio_utils.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from io import BytesIO
2-
import os
32
from typing import Optional
43
from filelock import FileLock
4+
import base64
5+
import os
56

67
import ffmpeg
78
import numpy as np
@@ -93,3 +94,18 @@ def extract_audio(video_path: str, output_audio_path: str) -> None:
9394
except ffmpeg.Error as e:
9495
hlog(f"Error extracting audio from video: {video_path}: {e.stderr.decode()}")
9596
raise e
97+
98+
99+
def encode_audio_to_base64(file_path: str) -> str:
    """
    Read an audio file from disk and return its raw bytes as Base64 text.

    Args:
        file_path (str): Location of the audio file to encode.

    Returns:
        str: The file's contents, Base64-encoded and decoded to a UTF-8 string.

    Raises:
        AssertionError: If nothing exists at `file_path`.
    """
    assert os.path.exists(file_path), f"Audio file does not exist at path: {file_path}"

    # Binary mode: the bytes are encoded as-is, without any text decoding.
    with open(file_path, "rb") as handle:
        raw_bytes: bytes = handle.read()

    encoded_bytes: bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

0 commit comments

Comments
 (0)