|
| 1 | +import os |
| 2 | +from typing import List |
| 3 | + |
| 4 | +from helm.benchmark.scenarios.scenario import ( |
| 5 | + TEST_SPLIT, |
| 6 | + Scenario, |
| 7 | + Instance, |
| 8 | + Input, |
| 9 | +) |
| 10 | +from helm.common.audio_utils import is_invalid_audio_file |
| 11 | +from helm.common.media_object import MediaObject, MultimediaObject |
| 12 | + |
| 13 | + |
class VoiceJailbreakAttacksScenario(Scenario):
    """
    Voice Jailbreak Attacks Against GPT-4o

    Paper: https://arxiv.org/abs/2405.19103

    The examples used in this scenario were generated following the instructions in the GitHub repository:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack
    Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".

    We ran:
    python tts/prompt2audio.py --dataset baseline --voice fable
    python tts/prompt2audio.py --dataset textjailbreak --voice fable

    then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:

    voice_jailbreak_attacks/
        baseline_fable/
            *.wav
        textjailbreak_fable/
            *.wav

    Base prompts (30 total):
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
    Jailbreak templates:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv

    The authors used Attack Success Rate (ASR) as the evaluation metric.
    Specifically, two authors manually label each response to determine if it answers the forbidden
    question. If there is any uncertainty, the authors discuss to reach a conclusion.

    Citation:

    @article{SWBZ24,
        author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
        title = {{Voice Jailbreak Attacks Against GPT-4o}},
        journal = {{CoRR abs/2405.19103}},
        year = {2024}
    }
    """

    # Subset names accepted by the constructor; each maps to a "<subset>_fable" audio folder.
    VALID_SUBSETS = ["baseline", "textjailbreak"]

    name = "voice_jailbreak_attacks"
    description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
    tags = ["audio", "safety"]

    def __init__(self, subset: str = "baseline") -> None:
        """
        Select which pre-generated audio subset this scenario serves.

        Args:
            subset: One of `VALID_SUBSETS`.

        Raises:
            AssertionError: If `subset` is not one of `VALID_SUBSETS`.
        """
        super().__init__()
        assert subset in self.VALID_SUBSETS, f"Invalid subset: {subset}"
        self._subset = subset

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one test instance per WAV file found in the selected subset's folder.

        The audio is not downloaded here: the folder `<output_path>/<subset>_fable` must
        already have been populated manually (see the class docstring).

        Args:
            output_path: Directory that contains the `<subset>_fable` audio folder.

        Returns:
            Instances in file-name order, each holding a single audio media object,
            no references, and the test split.

        Raises:
            AssertionError: If the audio folder is missing, or if a WAV file is rejected
                by `is_invalid_audio_file`.
        """
        audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
        # isdir (rather than exists) so that a stray regular file at this path is reported
        # here instead of failing later inside os.listdir.
        assert os.path.isdir(audio_directory_path), f"Audio directory does not exist: {audio_directory_path}"

        instances: List[Instance] = []
        # os.listdir returns entries in arbitrary order; sort so the instance order is
        # reproducible across machines and runs.
        for file_name in sorted(os.listdir(audio_directory_path)):
            # Match the extension itself, not merely names that happen to end in "wav".
            if not file_name.endswith(".wav"):
                continue

            audio_path: str = os.path.join(audio_directory_path, file_name)
            assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"

            audio_input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        MediaObject(location=audio_path, content_type="audio/wav"),
                    ]
                )
            )
            instances.append(Instance(input=audio_input, references=[], split=TEST_SPLIT))

        return instances
0 commit comments