stanford-crfm · teetone · Mar 26, 2025 · Mar 25, 2025 · Mar 26, 2025 · Mar 26, 2025
diff --git a/src/helm/clients/audio_language/qwen2_audiolm_client.py b/src/helm/clients/audio_language/qwen2_audiolm_client.py
@@ -41,9 +41,6 @@ class Qwen2AudioLMClient(CachingClient):
     """
 
     END_OF_TEXT_TOKEN: str = "<|im_end|>"
-    # The official recommendation is to set the prefix length to 256
-    # https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
-    PREFIX_TOKEN_LENGTH: int = 256
 
     def __init__(self, cache_config: CacheConfig):
         super().__init__(cache_config=cache_config)
@@ -65,12 +62,9 @@ def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
             if loaded_model_processor is None:
                 hlog(f"Loading model {model_name} and caching in memory...")
                 model = Qwen2AudioForConditionalGeneration.from_pretrained(
-                    model_name,
-                    device_map=self._device,
+                    model_name, device_map=self._device
                 ).eval()
-                tokenizer = AutoProcessor.from_pretrained(
-                    model_name,
-                )
+                tokenizer = AutoProcessor.from_pretrained(model_name)
                 _models[model_name] = LoadedQwenModelProcessor(model, tokenizer)
                 loaded_model_processor = _models[model_name]
 
@@ -84,11 +78,6 @@ def make_request(self, request: Request) -> RequestResult:
         model = loaded_model_processor.model
         tokenizer = loaded_model_processor.tokenizer
 
-        # Qwen2-Audio-Instruct counts input into the max_length, so we need to add the length of the prompt
-        generation_args = {
-            "max_length": request.max_tokens + self.PREFIX_TOKEN_LENGTH,
-        }
-
         input_query: List[Dict[str, Any]] = []
         query: List[Dict[str, str]] = []
         prompt_text: str = ""
@@ -142,10 +131,15 @@ def do_it() -> Dict[str, Any]:
                             return_tensors="pt",
                             padding=True,
                         )
+                        input_length = inputs.input_ids.size(1)
+                        # Qwen2-Audio-Instruct counts the prompt tokens toward max_length, so the
+                        # prompt length is added to max_tokens in the generate() call below
                         inputs = inputs.to(self._device)
-                        pred = model.generate(**inputs, **generation_args)
-                        completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
+                        pred = model.generate(**inputs, max_length=request.max_tokens + input_length)[:, input_length:]
 
+                        completion = tokenizer.decode(
+                            pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+                        )
                         # The processor of Qwen2-Audio-Instruct consists an AutoTokenizer and a WhisperFeatureExtractor
                         tokens: List[str] = tokenizer.tokenizer.tokenize(completion)
                         return {"output": (completion, tokens)}
@@ -156,7 +150,7 @@ def do_it() -> Dict[str, Any]:
                             "completion_index": completion_index,
                             "model": request.model,
                             "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
-                            **generation_args,
+                            "max_tokens": request.max_tokens,
                         },
                         request=request,
                     )