|
17 | 17 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
18 | 18 |
|
19 | 19 | from packaging import version
|
20 |
| -from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel |
| 20 | +from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel |
21 | 21 | from transformers.utils import is_tf_available
|
22 | 22 |
|
23 | 23 | from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
|
|
69 | 69 | JaisModelPatcher,
|
70 | 70 | LlamaModelPatcher,
|
71 | 71 | LlavaImageEmbeddingModelPatcher,
|
| 72 | + LlavaQwen2ImageEmbeddingsModelPatcher, |
72 | 73 | MiniCPMVImageEmbeddingsModelPatcher,
|
73 | 74 | MiniCPMVResamplerModelPatcher,
|
74 | 75 | MistralModelPatcher,
|
@@ -1218,8 +1219,8 @@ def patch_model_for_export(
|
1218 | 1219 |
|
1219 | 1220 |
|
class LlavaConfigBehavior(str, enum.Enum):
    """Sub-model selector for multi-part LLaVA export.

    Member order is significant: ``SUPPORTED_BEHAVIORS`` lists are built by
    iterating this enum.
    """

    # Vision tower producing image feature embeddings.
    VISION_EMBEDDINGS = "vision_embeddings"
    # The decoder language model itself.
    LANGUAGE = "language"
    # Token-embedding lookup exported as a standalone sub-model.
    TEXT_EMBEDDINGS = "text_embeddings"
|
1224 | 1225 |
|
1225 | 1226 |
|
@@ -1380,6 +1381,166 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
|
1380 | 1381 | MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
|
1381 | 1382 |
|
1382 | 1383 |
|
@register_in_tasks_manager(
    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
)
class LlavaQwen2OpenVINOConfig(OnnxConfig):
    """OpenVINO export config for ``llava-qwen2`` (nanoLLaVA-style) models.

    The model is exported as three cooperating sub-models, selected via
    ``behavior``: the vision tower (``VISION_EMBEDDINGS``), the token-embedding
    layer (``TEXT_EMBEDDINGS``) and the decoder language model (``LANGUAGE``).
    This config instance describes one sub-model at a time; use
    `with_behavior` to derive configs for the other parts.
    """

    SUPPORTS_PAST = True
    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
        use_past: bool = False,
    ):
        """
        Args:
            config: The multimodal model's config; kept as ``self._orig_config``
                so language/text-embedding configs can be derived from it.
            task: Export task name passed through to `OnnxConfig`.
            int_dtype: Integer dtype used for exported inputs.
            float_dtype: Float dtype used for exported inputs.
            behavior: Which sub-model this config describes.
            preprocessors: Optional preprocessors forwarded to the base class.
            use_past: Accepted for signature compatibility with sibling
                configs; not used directly here (past handling is delegated to
                the internal language-model export config).
        """
        self._behavior = behavior
        self._orig_config = config
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            # llava-qwen2 stores its vision tower as a reference to a separate
            # model repo (`mm_vision_tower`), so the vision config must be
            # fetched from there. NOTE(review): `trust_remote_code=True` runs
            # remote code from that repo — acceptable here because loading the
            # base model already required it.
            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
            if hasattr(config, "vision_config"):
                config = config.vision_config
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # Only the vision sub-model declares inputs; the other behaviors get
        # their inputs from the derived configs returned by `with_behavior`.
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"last_hidden_state": {0: "batch_size"}}

    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
        """Return the sub-module of `model` that corresponds to `behavior`."""
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.LANGUAGE:
            # Bypass the multimodal forward override so the language model is
            # traced with the plain text-decoder signature.
            model.forward = super(type(model), model).forward
            return model

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            # The whole model is returned; the patcher restricts tracing to the
            # image-embedding path.
            return model

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.model.embed_tokens
            # Attach a config so the exporter can treat the embedding layer as
            # a standalone model.
            text_embedding.config = model.model.config
            return text_embedding

    def _language_model_type(self) -> str:
        """Map the composite model type onto the base LM type registered in TasksManager."""
        model_type = self._orig_config.model_type.replace("llava-", "")
        return model_type.replace("_", "-")

    def _get_internal_export_config(self):
        """Build the text-generation-with-past export config for the underlying language model.

        Raises:
            ValueError: If the language model type, or its text-generation
                export config, is not registered in `TasksManager`.
        """
        model_type = self._language_model_type()
        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            raise ValueError(
                f"Unsupported language model type provided `{model_type}`. Please define custom export config"
            )

        if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
            raise ValueError(
                f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
            )
        internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
            "text-generation-with-past"
        ]
        return internal_export_config_class(
            self._orig_config,
            use_past=True,
            use_past_in_inputs=True,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
        )

    def with_behavior(
        self,
        behavior: Union[str, LlavaConfigBehavior],
    ):
        """
        Creates a config for different behaviour.

        Args:
            behavior ([`ConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            internal_export_config = self._get_internal_export_config()
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == LlavaConfigBehavior.LANGUAGE:
            internal_export_config = self._get_internal_export_config()
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Wrap `model` so only the selected sub-model's forward path is traced."""
        model_kwargs = model_kwargs or {}
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return super().patch_model_for_export(model, model_kwargs)
        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)

    def rename_ambiguous_inputs(self, inputs):
        """Rename `pixel_values` to `images`, the name the vision forward expects."""
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            model_inputs = {}
            model_inputs["images"] = inputs["pixel_values"]
            return model_inputs
        return super().rename_ambiguous_inputs(inputs)
| 1543 | + |
1383 | 1544 | class InternVLChatConfigBehavior(str, enum.Enum):
|
1384 | 1545 | LANGUAGE = "language"
|
1385 | 1546 | VISION_EMBEDDINGS = "vision_embeddings"
|
@@ -1508,8 +1669,8 @@ def with_behavior(
|
1508 | 1669 | preprocessors=self._preprocessors,
|
1509 | 1670 | )
|
1510 | 1671 |
|
1511 |
| - def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]): |
1512 |
| - if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior): |
| 1672 | + def get_model_for_behavior(self, model, behavior: Union[str, InternVLChatConfigBehavior]): |
| 1673 | + if isinstance(behavior, str) and not isinstance(behavior, InternVLChatConfigBehavior): |
1513 | 1674 | behavior = InternVLChatConfigBehavior(behavior)
|
1514 | 1675 |
|
1515 | 1676 | if behavior == InternVLChatConfigBehavior.LANGUAGE:
|
|
0 commit comments