
Commit c1bd7f7

Merge branch 'huggingface:main' into varlen
2 parents: 7e20b86 + 87c431c

22 files changed: +1308 -239 lines

.github/workflows/test_openvino.yml (+5)

@@ -50,6 +50,11 @@ jobs:
         name: Install specific dependencies and versions required for older transformers
         run: |
           pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
+
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*modeling*'}}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF

.github/workflows/test_openvino_full.yml (+5)

@@ -78,6 +78,11 @@ jobs:
        if: ${{ matrix.transformers-version != 'latest' }}
        run: pip install transformers==${{ matrix.transformers-version }}
 
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.os != 'windows-2019' }}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
+
      - name: Pip freeze
        run: pip freeze

.github/workflows/test_openvino_slow.yml (+5)

@@ -49,6 +49,11 @@ jobs:
        name: Install specific dependencies and versions required for older transformers
        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.os != 'windows-2019' }}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
+
      - name: Pip freeze
        run: pip freeze

docs/source/openvino/export.mdx (+17 -3)

@@ -31,13 +31,14 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
                                    [--dataset DATASET] [--all-layers] [--awq] [--scale-estimation] [--gptq]
                                    [--lora-correction] [--sensitivity-metric SENSITIVITY_METRIC]
                                    [--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer]
+                                   [--smooth-quant-alpha SMOOTH_QUANT_ALPHA]
                                    output
 
 optional arguments:

@@ -66,6 +67,10 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
+  --quant-mode {int8}
+                        Quantization precision mode. This is used for applying full model quantization including
+                        activations. The only currently supported choice is 'int8' for int8 quantization of both
+                        weights and activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library

@@ -102,8 +107,8 @@ Optional arguments:
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
                         requires additional time for tuning weights on a calibration dataset. To run AWQ, please also
-                        provide a dataset argument. Note: it is possible that there will be no matching patterns in the
-                        model to apply AWQ, in such case it will be skipped.
+                        provide a dataset argument. Note: it is possible that there will be no matching patterns in
+                        the model to apply AWQ, in such case it will be skipped.
   --scale-estimation    Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between
                         the original and compressed layers. Providing a dataset is required to run scale estimation.
                         Please note, that applying scale estimation takes additional memory and time.

@@ -128,6 +133,9 @@ Optional arguments:
                         OpenVINO native inference code that expects KV-cache inputs and outputs in the model.
   --disable-convert-tokenizer
                         Do not add converted tokenizer and detokenizer OpenVINO models.
+  --smooth-quant-alpha SMOOTH_QUANT_ALPHA
+                        SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers
+                        and reduces quantization error. Valid only when activations quantization is enabled.
 ```
 
 You can also apply fp16, 8-bit or 4-bit weight-only quantization on the Linear, Convolutional and Embedding layers when exporting your model by setting `--weight-format` to respectively `fp16`, `int8` or `int4`.

@@ -158,6 +166,12 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>
 
 
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+
+```bash
+optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
+```
+
 ### Decoder models
 
 For models with a decoder, we enable the re-use of past keys and values by default. This allows to avoid recomputing the same intermediate activations at each generation step. To export the model without, you will need to remove the `-with-past` suffix when specifying the task.
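
Note: for readers who prefer the Python API over the CLI example above, the sketch below mirrors what the new `--quant-mode int8` path in `optimum/commands/export/openvino.py` assembles (the same quantization dictionary wrapped in an `OVConfig` and applied through `OVModelForSpeechSeq2Seq`). It is a minimal sketch based on this commit's CLI code; passing the config to `from_pretrained` this way is an assumption, and the accepted keyword arguments may differ between optimum-intel releases.

```python
# Sketch of a Python-API equivalent of the docs example:
#   optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 \
#       --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
from optimum.intel import OVConfig, OVModelForSpeechSeq2Seq

# Same dictionary the CLI builds when --quant-mode int8 is passed
quantization_config = {
    "weight_format": "int8",
    "activation_format": "int8",
    "bits": 8,
    "sym": False,
    "dataset": "librispeech",
    "num_samples": 32,
    "smooth_quant_alpha": 0.9,
}
ov_config = OVConfig(quantization_config=quantization_config)

# Export and quantize in one step (assumption: from_pretrained accepts the
# parsed quantization_config object, as the CLI forwards it internally)
model = OVModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo",
    export=True,
    quantization_config=ov_config.quantization_config,
)
model.save_pretrained("whisper-large-v3-turbo")
```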

optimum/commands/export/openvino.py (+66 -5)

@@ -75,6 +75,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help="The weight format of the exported model.",
     )
+    optional_group.add_argument(
+        "--quant-mode",
+        type=str,
+        choices=["int8"],
+        default=None,
+        help=(
+            "Quantization precision mode. This is used for applying full model quantization including activations. "
+            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
+        ),
+    )
     optional_group.add_argument(
         "--library",
         type=str,

@@ -228,6 +238,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
         action="store_true",
         help="Do not add converted tokenizer and detokenizer OpenVINO models.",
     )
+    optional_group.add_argument(
+        "--smooth-quant-alpha",
+        type=float,
+        default=None,
+        help=(
+            "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and "
+            "reduces quantization error. Valid only when activations quantization is enabled."
+        ),
+    )
 
 
 def no_compression_parameter_provided(args):

@@ -252,6 +271,20 @@ def no_compression_parameter_provided(args):
     )
 
 
+def no_quantization_parameter_provided(args):
+    return all(
+        (
+            it is None
+            for it in (
+                args.sym,
+                args.dataset,
+                args.num_samples,
+                args.smooth_quant_alpha,
+            )
+        )
+    )
+
+
 class OVExportCommand(BaseOptimumCLICommand):
     COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.")
 

@@ -291,16 +324,21 @@ def run(self):
         else:
             library_name = self.args.library
 
-        if self.args.weight_format is None:
+        if self.args.weight_format is None and self.args.quant_mode is None:
             ov_config = None
             if not no_compression_parameter_provided(self.args):
                 raise ValueError(
                     "Some compression parameters are provided, but the weight format is not specified. "
                     "Please provide it with --weight-format argument."
                 )
+            if not no_quantization_parameter_provided(self.args):
+                raise ValueError(
+                    "Some quantization parameters are provided, but the quantization mode is not specified. "
+                    "Please provide it with --quant-mode argument."
+                )
         elif self.args.weight_format in {"fp16", "fp32"}:
             ov_config = OVConfig(dtype=self.args.weight_format)
-        else:
+        elif self.args.weight_format is not None:
             # For int4 quantization if no parameter is provided, then use the default config if exists
             if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4":
                 quantization_config = get_default_int4_config(self.args.model)

@@ -326,6 +364,21 @@ def run(self):
             if quantization_config.get("dataset", None) is not None:
                 quantization_config["trust_remote_code"] = self.args.trust_remote_code
             ov_config = OVConfig(quantization_config=quantization_config)
+        else:
+            if self.args.quant_mode != "int8":
+                raise ValueError("Only 'int8' quantization mode is currently supported.")
+
+            quantization_config = {
+                "weight_format": self.args.quant_mode,
+                "activation_format": self.args.quant_mode,
+                "bits": 8,
+                "sym": self.args.sym or False,
+                "dataset": self.args.dataset,
+                "num_samples": self.args.num_samples,
+                "smooth_quant_alpha": self.args.smooth_quant_alpha,
+                "trust_remote_code": self.args.trust_remote_code,
+            }
+            ov_config = OVConfig(quantization_config=quantization_config)
 
         quantization_config = ov_config.quantization_config if ov_config else None
         quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None

@@ -368,17 +421,25 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+        elif (
+            quantize_with_dataset
+            and (task.startswith("text-generation") or task == "automatic-speech-recognition")
+            or (task == "image-text-to-text" and quantization_config is not None)
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM
 
                 model_cls = OVModelForCausalLM
-            else:
+            elif task == "image-text-to-text":
                 from optimum.intel import OVModelForVisualCausalLM
 
                 model_cls = OVModelForVisualCausalLM
+            else:
+                from optimum.intel import OVModelForSpeechSeq2Seq
+
+                model_cls = OVModelForSpeechSeq2Seq
 
-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
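
As a quick end-to-end check of the new `automatic-speech-recognition` branch, the exported folder can be loaded back with `OVModelForSpeechSeq2Seq` (the class this commit wires into the CLI) and run through a standard transformers pipeline. A minimal sketch: the output path follows the docs example above, and it assumes the processor/tokenizer files were saved next to the model, as the CLI export normally does.

```python
from transformers import AutoProcessor, pipeline
from optimum.intel import OVModelForSpeechSeq2Seq

# Directory produced by the docs example: the int8-quantized OpenVINO Whisper export
model_dir = "./whisper-large-v3-turbo"

model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir)
processor = AutoProcessor.from_pretrained(model_dir)

# OV* models plug into the regular transformers ASR pipeline
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)
print(asr("sample.wav")["text"])  # "sample.wav" is a placeholder audio file
```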
