VoiceJailbreakAttacksScenario (#3361)

teetone · web-flow · commit 0c9803ef3b2d · 2025-02-24T20:20:07.000-08:00
diff --git a/setup.cfg b/setup.cfg
@@ -146,7 +146,7 @@ mistral =
     mistralai~=1.1
 
 openai =
-    openai~=1.61
+    openai~=1.64
     tiktoken~=0.7
     pydantic~=2.0  # For model_dump(mode="json") - openai only requires pydantic>=1.9.0
 
@@ -282,7 +282,7 @@ audiolm =
     crfm-helm[openai]
     crfm-helm[google]
 
-    # For clipping audio
+    # For clipping and converting audio
     pydub~=0.25.1
 
     # For extracting audio from videos
diff --git a/src/helm/benchmark/presentation/run_entries_audio.conf b/src/helm/benchmark/presentation/run_entries_audio.conf
@@ -6,6 +6,7 @@ entries: [
     {description: "audiocaps:model=audiolm", priority: 1}
     {description: "voxceleb2:model=audiolm", priority: 1}
     {description: "librispeech:model=audiolm", priority: 1}
+    {description: "air_bench_chat:subject=speech,model=audiolm", priority: 1}
 
     ####################################################################################################################
     # Fairness
@@ -22,7 +23,6 @@ entries: [
     # Knowledge
     ####################################################################################################################
 
-    {description: "air_bench_chat:subject=speech,model=audiolm", priority: 1}
     {description: "air_bench_chat:subject=sound,model=audiolm", priority: 1}
     {description: "air_bench_chat:subject=music,model=audiolm", priority: 1}
     {description: "air_bench_chat:subject=mix,model=audiolm", priority: 1}
@@ -59,9 +59,10 @@ entries: [
     {description: "fleurs:language=Thai,model=audiolm", priority: 1}
     {description: "fleurs:language=Mandarin_Chinese,model=audiolm", priority: 1}
 
-    {description: "common_voice_15:language=English,model=audiolm", priority: 1}
-    {description: "common_voice_15:language=German,model=audiolm", priority: 1}
-    {description: "common_voice_15:language=French,model=audiolm", priority: 1}
+    # TODO: remove?
+    # {description: "common_voice_15:language=English,model=audiolm", priority: 1}
+    # {description: "common_voice_15:language=German,model=audiolm", priority: 1}
+    # {description: "common_voice_15:language=French,model=audiolm", priority: 1}
 
     {description: "multilingual_librispeech:language=Dutch,model=audiolm", priority: 1}
     {description: "multilingual_librispeech:language=Spanish,model=audiolm", priority: 1}
@@ -110,5 +111,6 @@ entries: [
     # Safety
     ####################################################################################################################
 
-
+    {description: "voice_jailbreak_attacks:subset=baseline,model=audiolm", priority: 1}
+    {description: "voice_jailbreak_attacks:subset=textjailbreak,model=audiolm", priority: 1}
 ]
diff --git a/src/helm/benchmark/presentation/run_entries_audio_debug.conf b/src/helm/benchmark/presentation/run_entries_audio_debug.conf
@@ -1,3 +1,4 @@
 entries: [
-    {description: "mustard:model=audiolm", priority: 1}
+    {description: "voice_jailbreak_attacks:subset=baseline,model=audiolm", priority: 1}
+    {description: "voice_jailbreak_attacks:subset=textjailbreak,model=audiolm", priority: 1}
 ]
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -11,6 +11,7 @@
 from helm.benchmark.metrics.common_metric_specs import (
     get_classification_metric_specs,
     get_exact_match_metric_specs,
+    get_generative_harms_metric_specs,
     get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
@@ -200,6 +201,28 @@ def get_mustard_audio_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("voice_jailbreak_attacks")
+def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
+        "VoiceJailbreakAttacksScenario",
+        args={"subset": subset},
+    )
+    adapter_spec = _get_generation_adapter_spec(max_tokens=1024)
+    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
+        include_basic_metrics=True, include_generative_harms_metrics=True
+    )
+
+    run_spec_name: str = "voice_jailbreak_attacks"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("covost2")
 def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
diff --git a/src/helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py b/src/helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py
@@ -0,0 +1,87 @@
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Input,
+)
+from helm.common.audio_utils import is_invalid_audio_file
+from helm.common.media_object import MediaObject, MultimediaObject
+
+
+class VoiceJailbreakAttacksScenario(Scenario):
+    """
+    Voice Jailbreak Attacks Against GPT-4o
+
+    Paper: https://arxiv.org/abs/2405.19103
+
+    The examples used in this scenario were generated following the instructions in the GitHub repository:
+    https://github.com/TrustAIRLab/VoiceJailbreakAttack
+    Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".
+
+    We ran:
+    python tts/prompt2audio.py --dataset baseline --voice fable
+    python tts/prompt2audio.py --dataset textjailbreak --voice fable
+
+    then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:
+
+    voice_jailbreak_attacks/
+        baseline_fable/
+            *.wav
+        textjailbreak_fable/
+            *.wav
+
+    Base prompts (30 total):
+    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
+    Jailbreak templates:
+    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv
+
+    The authors used Attack Success Rate (ASR) as the evaluation metric.
+    Specifically, two authors manually label each response to determine if it answers the forbidden
+    question. If there is any uncertainty, the authors discuss it to reach a conclusion.
+
+    Citation:
+
+    @article{SWBZ24,
+      author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
+      title = {{Voice Jailbreak Attacks Against GPT-4o}},
+      journal = {{CoRR abs/2405.19103}},
+      year = {2024}
+    }
+    """
+
+    VALID_SUBSETS = ["baseline", "textjailbreak"]
+
+    name = "voice_jailbreak_attacks"
+    description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
+    tags = ["audio", "safety"]
+
+    def __init__(self, subset: str = "baseline") -> None:
+        super().__init__()
+        assert subset in self.VALID_SUBSETS, f"Invalid subset: {subset}"
+        self._subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
+        assert os.path.exists(audio_directory_path), f"Audio directory does not exist: {audio_directory_path}"
+
+        instances: List[Instance] = []
+        for file in os.listdir(audio_directory_path):
+            if not file.endswith("wav"):
+                continue
+
+            audio_path: str = os.path.join(audio_directory_path, file)
+            assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        MediaObject(location=audio_path, content_type="audio/wav"),
+                    ]
+                )
+            )
+            instances.append(Instance(input=input, references=[], split=TEST_SPLIT))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_audio.yaml b/src/helm/benchmark/static/schema_audio.yaml
@@ -172,6 +172,23 @@ metrics:
     description: Chinese character error rate (CER) for evaluating the accuracy of Chiese ASR.
     lower_is_better: true
 
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
 ############################################################
 perturbations: []
 
@@ -232,6 +249,7 @@ run_groups:
       # Toxicity Detection
       - mutox
       # Safety
+      - voice_jailbreak_attacks
 
   - name: audio_mnist
     display_name: AudioMNIST
@@ -555,6 +573,23 @@ run_groups:
       when: "2019"
       language: English
 
+  - name: voice_jailbreak_attacks
+    display_name: Voice Jailbreak Attacks Against GPT-4o
+    description: >
+      Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: toxic_frac
+      main_split: test
+    taxonomy:
+      task: refusal for safety
+      what: voice jailbreak attacks against GPT-4o
+      who: AI-generated speech
+      when: "2024"
+      language: English
+
   - name: ami
     display_name: AMI Meeting Corpus
     description: >
diff --git a/src/helm/common/audio_utils.py b/src/helm/common/audio_utils.py
@@ -1,7 +1,8 @@
 from io import BytesIO
-import os
 from typing import Optional
 from filelock import FileLock
+import base64
+import os
 
 import ffmpeg
 import numpy as np
@@ -93,3 +94,18 @@ def extract_audio(video_path: str, output_audio_path: str) -> None:
     except ffmpeg.Error as e:
         hlog(f"Error extracting audio from video: {video_path}: {e.stderr.decode()}")
         raise e
+
+
+def encode_audio_to_base64(file_path: str) -> str:
+    """
+    Encodes an audio file to a Base64 string.
+
+    Args:
+        file_path (str): Path to the audio file.
+
+    Returns:
+        str: Base64-encoded string of the audio file.
+    """
+    assert os.path.exists(file_path), f"Audio file does not exist at path: {file_path}"
+    with open(file_path, "rb") as audio_file:
+        return base64.b64encode(audio_file.read()).decode("utf-8")

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
`1`	`1`	`entries: [`
`2`		`- {description: "mustard:model=audiolm", priority: 1}`
	`2`	`+ {description: "voice_jailbreak_attacks:subset=baseline,model=audiolm", priority: 1}`
	`3`	`+ {description: "voice_jailbreak_attacks:subset=textjailbreak,model=audiolm", priority: 1}`
`3`	`4`	`]`