Skip to content

Commit ce04810

Browse files
authored
Fix Qwen2-Audio-Instruct empty outputs (#3474)
1 parent 9fa4cfb commit ce04810

File tree

2 files changed

+11
-17
lines changed

2 files changed

+11
-17
lines changed

src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ class VoxCeleb2Scenario(Scenario):
4040
"https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
4141
)
4242
IDENTITY_INSTRUCTION = (
43-
"Listen to the audio and take your best guess to determine if the two speakers are the same person."
43+
"Listen to the audio and take your best guess to determine if the two speakers are the same person. "
44+
"Give just the letter of your answer and nothing else."
4445
)
4546

4647
name = "voxceleb2"

src/helm/clients/audio_language/qwen2_audiolm_client.py

+9-16
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@ class Qwen2AudioLMClient(CachingClient):
4141
"""
4242

4343
END_OF_TEXT_TOKEN: str = "<|im_end|>"
44-
# The official recommendation is to set the prefix length to 256
45-
# https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
46-
PREFIX_TOKEN_LENGTH: int = 256
4744

4845
def __init__(self, cache_config: CacheConfig):
4946
super().__init__(cache_config=cache_config)
@@ -84,11 +81,6 @@ def make_request(self, request: Request) -> RequestResult:
8481
model = loaded_model_processor.model
8582
tokenizer = loaded_model_processor.tokenizer
8683

87-
# Qwen2-Audio-Instruct counts input into the max_length, so we need to add the length of the prompt
88-
generation_args = {
89-
"max_length": request.max_tokens + self.PREFIX_TOKEN_LENGTH,
90-
}
91-
9284
input_query: List[Dict[str, Any]] = []
9385
query: List[Dict[str, str]] = []
9486
prompt_text: str = ""
@@ -142,10 +134,15 @@ def do_it() -> Dict[str, Any]:
142134
return_tensors="pt",
143135
padding=True,
144136
)
137+
input_length = inputs.input_ids.size(1)
138+
# Qwen2-Audio-Instruct counts the input tokens toward max_length,
139+
# so we add the prompt length to the requested max_tokens
145140
inputs = inputs.to(self._device)
146-
pred = model.generate(**inputs, **generation_args)
147-
completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
141+
pred = model.generate(**inputs, max_length=request.max_tokens + input_length)[:, input_length:]
148142

143+
completion = tokenizer.decode(
144+
pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
145+
)
149146
# The processor of Qwen2-Audio-Instruct consists of an AutoTokenizer and a WhisperFeatureExtractor
150147
tokens: List[str] = tokenizer.tokenizer.tokenize(completion)
151148
return {"output": (completion, tokens)}
@@ -156,7 +153,7 @@ def do_it() -> Dict[str, Any]:
156153
"completion_index": completion_index,
157154
"model": request.model,
158155
"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
159-
**generation_args,
156+
"max_tokens": request.max_tokens,
160157
},
161158
request=request,
162159
)
@@ -167,11 +164,7 @@ def do_it() -> Dict[str, Any]:
167164
)
168165

169166
text, tokens = result["output"]
170-
171-
# Truncate the output text as the original Qwen includes the prompt in the output sequence
172-
text = text[len(prompt_text) :]
173-
text = text.replace(self.END_OF_TEXT_TOKEN, "")
174-
hlog(f"Truncated: {text}")
167+
hlog(f"Generated: {text}")
175168

176169
# Tokenize truncated text to get the list of tokens
177170
completions.append(

0 commit comments

Comments
 (0)