Commit 28a574f

Merge branch 'master' into xinhe/fix
2 parents: a1f67b0 + adf0ca7

12 files changed: +33 −18 lines


.azure-pipelines/template/docker-template.yml

+2 −2

@@ -74,7 +74,7 @@ steps:
   - ${{ if eq(parameters.imageSource, 'pull') }}:
     - script: |
-        docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+        docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
       displayName: "Pull habana docker image"

   - script: |
@@ -95,7 +95,7 @@ steps:
       else
         docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
           --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
       fi
       echo "Show the container list after docker run ... "
       docker ps -a

.azure-pipelines/ut-3x-pt-fp8.yml

+2 −0

@@ -15,6 +15,8 @@ pr:
     - neural_compressor/torch
     - test/3x/torch/algorithms/fp8_quant
     - test/3x/torch/quantization/fp8_quant
+    - test/3x/torch/quantization/weight_only/test_rtn.py
+    - test/3x/torch/quantization/weight_only/test_load.py
     - setup.py
     - requirements_pt.txt

README.md

+1 −1

@@ -78,7 +78,7 @@ Following example code demonstrates FP8 Quantization, it is supported by Intel G
 To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, please refer to following script for environment setup. More details can be found in [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).
 ```bash
 # Run a container with an interactive shell
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
 ```
 Run the example:
 ```python

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt

+2 −3

@@ -11,6 +11,5 @@ lm_eval==0.4.3
 peft
 numba
 tbb
-# TODO: (Yi) SW-208079 replace auto-round with the released version
-auto-round-hpu @ git+https://github.com/intel/auto-round.git@hpu_only_pkg
-optimum-habana==1.14.1
+auto-round @ git+https://github.com/intel/auto-round.git@v0.4.2
+optimum-habana==1.14.1
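The pinned tag replaces the temporary `hpu_only_pkg` fork branch now that SW-208079 is resolved. For reference, a direct-URL requirement like this resolves the same way as an explicit pip install (illustrative command, not part of the diff):

```bash
pip install "auto-round @ git+https://github.com/intel/auto-round.git@v0.4.2"
```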

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh

+3 −1

@@ -14,6 +14,7 @@ function init_params {
   batch_size=16
   tuned_checkpoint=saved_results
   task=lambada_openai
+  incbench_cmd="incbench --num_cores_per_instance 4"
   echo ${max_eval_samples}
   for var in "$@"
   do
@@ -104,6 +105,7 @@ function run_benchmark {
     elif [ "${topology}" = "opt_125m_woq_autoround_int4_hpu" ]; then
         model_name_or_path="facebook/opt-125m"
         extra_cmd=$extra_cmd" --woq_algo AutoRound"
+        incbench_cmd="incbench --num_instances 1"
     elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
         model_name_or_path="facebook/opt-125m"
     fi
@@ -116,7 +118,7 @@ function run_benchmark {
             --batch_size ${batch_size} \
             ${extra_cmd} ${mode_cmd}
     elif [[ ${mode} == "performance" ]]; then
-        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+        ${incbench_cmd} run_clm_no_trainer.py \
            --model ${model_name_or_path} \
            --batch_size ${batch_size} \
            --output_dir ${tuned_checkpoint} \
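Net effect: the script now defines a default `incbench` invocation and lets the HPU AutoRound topology swap it for a single-instance run. A minimal sketch of the override pattern, using the variable and flags from the diff (the final invocation is illustrative):

```bash
#!/bin/bash
# Default: bind each benchmark instance to 4 cores.
incbench_cmd="incbench --num_cores_per_instance 4"

# The HPU AutoRound topology runs as a single instance instead.
if [ "${topology}" = "opt_125m_woq_autoround_int4_hpu" ]; then
    incbench_cmd="incbench --num_instances 1"
fi

# The performance branch then runs whichever command was selected.
${incbench_cmd} run_clm_no_trainer.py --model "${model_name_or_path}"
```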

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py

+10 −7

@@ -270,8 +270,9 @@ def get_user_model():
         torchscript = True
     if args.woq_algo == "AutoRound" and is_habana_framework_installed():
         print("Quantizing model with AutoRound on HPU")
-        check_torch_compile_with_hpu_backend()
-        set_envs_for_torch_compile_with_hpu_backend()
+        if args.quantize:
+            check_torch_compile_with_hpu_backend()
+            set_envs_for_torch_compile_with_hpu_backend()
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
         trust_remote_code=args.trust_remote_code,
@@ -403,11 +404,12 @@ def calib_func(prepared_model):
             max_seq_length=args.gptq_max_seq_length,
         )
         dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader()
-        from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device
+        from neural_compressor.torch.utils import get_model_device, move_input_device
         from tqdm import tqdm
         def run_fn_for_gptq(model, dataloader_for_calibration, *args):
             for batch in tqdm(dataloader_for_calibration):
-                batch = move_input_to_device(batch, device=None)
+                device = get_model_device(model)
+                batch = move_input_device(batch, device=device)
                 if isinstance(batch, tuple) or isinstance(batch, list):
                     model(batch[0])
                 elif isinstance(batch, dict):
@@ -525,11 +527,12 @@ def run_fn_for_autoround(model, dataloader):
         )
         dataloader = dataloaderPreprocessor.get_prepared_dataloader()
         custom_tune_config = TuningConfig(config_set=get_woq_tuning_config())
-        from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device
+        from neural_compressor.torch.utils import get_model_device, move_input_device
         from tqdm import tqdm
         def run_fn_for_gptq(model, dataloader_for_calibration, *args):
             for batch in tqdm(dataloader_for_calibration):
-                batch = move_input_to_device(batch, device=None)
+                device = get_model_device(model)
+                batch = move_input_device(batch, device=device)
                 if isinstance(batch, tuple) or isinstance(batch, list):
                     model(batch[0])
                 elif isinstance(batch, dict):
@@ -568,7 +571,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):


 if is_hpex_available():
-    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+    from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph
     user_model = user_model.to(torch.bfloat16)
     wrap_in_hpu_graph(user_model, max_graphs=10)

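Both calibration loops now move each batch to the model's own device rather than a hard-coded one, which matters once the model may live on HPU. A minimal sketch of the pattern, with illustrative stand-ins for the two helpers named in the diff (the real implementations live in `neural_compressor.torch.utils`):

```python
import torch

# Illustrative stand-ins for the helpers imported in the diff.
def get_model_device(model: torch.nn.Module):
    # Device of the model's parameters, e.g. "cpu", "cuda", "hpu".
    return next(model.parameters()).device.type

def move_input_device(batch, device):
    # Recursively relocate calibration inputs to the target device.
    if isinstance(batch, dict):
        return {k: v.to(device) for k, v in batch.items()}
    if isinstance(batch, (tuple, list)):
        return type(batch)(v.to(device) for v in batch)
    return batch.to(device)

def run_fn_for_gptq(model, dataloader, *args):
    # Calibration: forward passes with inputs on the model's device.
    for batch in dataloader:
        batch = move_input_device(batch, device=get_model_device(model))
        if isinstance(batch, (tuple, list)):
            model(batch[0])
        elif isinstance(batch, dict):
            model(**batch)
        else:
            model(batch)
```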
neural_compressor/common/__init__.py

+1 −1

@@ -27,7 +27,7 @@
     dump_elapsed_time,
 )
 from neural_compressor.common.base_config import options
-
+from neural_compressor.common.version import __version__

 __all__ = [
     "options",

neural_compressor/common/version.py

+1 −0

@@ -0,0 +1 @@
+../version.py
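A new file whose entire content is a relative path is how a symlink renders in a diff, so `neural_compressor/common/version.py` most likely points at the repository's top-level `version.py`. Combined with the `__init__.py` change above, the package version becomes importable; a minimal usage sketch, assuming `__version__` is defined in `version.py`:

```python
from neural_compressor.common import __version__

print(__version__)
```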

neural_compressor/evaluation/lm_eval/models/huggingface.py

+2 −0

@@ -969,6 +969,8 @@ def _model_call(self, inps, attn_mask=None, labels=None):
         output = output.logits
         if self.pad_to_buckets and padding_length != 0:  # use buckets to pad inputs
             output = output[:, :-padding_length, :]
+        if "hpu" in output.device.type:  # make sure return fp32 tensor for HPU, TODO: root cause
+            output = output.to(torch.float32)
         return output

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
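Elsewhere in this commit the model is cast to bfloat16 on Gaudi (`user_model.to(torch.bfloat16)` in `run_clm_no_trainer.py`), so without this cast lm-eval would score bf16 logits. A self-contained illustration of the precision gap the fp32 cast avoids (synthetic values, not the library's code path):

```python
import torch

logits = torch.randn(1, 4, 32000)   # fp32 reference logits
bf16 = logits.to(torch.bfloat16)    # what a bf16 HPU model would return

# bfloat16 keeps only ~8 mantissa bits, so log-probabilities drift;
# casting back to fp32 before scoring keeps the evaluation math stable.
drift = (logits.log_softmax(-1) - bf16.to(torch.float32).log_softmax(-1)).abs().max()
print(drift)
```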

test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py

+7 −2

@@ -1,7 +1,7 @@
 """Use this module as an example of how to write new unit tests for layers."""

 import os
-
+import sys
 import pytest
 import torch

@@ -58,7 +58,12 @@ def run_predefined_config():
         run_with_raised_exception(run_predefined_config, FileNotFoundError, "Failed to load file ")
     # TODO [SW-196641]: fix the following issue:
     elif quant_mode == QuantMode.SHAPE:
-        run_with_raised_exception(run_predefined_config, UnboundLocalError, "local variable 'fname_base' referenced before assignment")
+        error_message = (
+            "cannot access local variable 'fname_base' where it is not associated with a value"
+            if sys.version_info >= (3, 11)
+            else "local variable 'fname_base' referenced before assignment"
+        )
+        run_with_raised_exception(run_predefined_config, UnboundLocalError, error_message)
     else:
         run_predefined_config()
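The `sys.version_info` branch exists because CPython 3.11 reworded this exception's message. A self-contained check of the behavior the test now accounts for:

```python
def trigger():
    if False:
        fname_base = "never set"
    return fname_base  # fname_base is never bound on this path

try:
    trigger()
except UnboundLocalError as e:
    # <=3.10: "local variable 'fname_base' referenced before assignment"
    # >=3.11: "cannot access local variable 'fname_base' where it is not associated with a value"
    print(e)
```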

test/3x/torch/quantization/weight_only/test_rtn.py

+1 −0

@@ -16,6 +16,7 @@
 from neural_compressor.torch.utils import accelerator, is_hpex_available

 device = accelerator.current_device_name()
+torch.set_grad_enabled(False)


 class ModelConv1d(torch.nn.Module):
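Setting this at import time turns off autograd for every forward pass in the test module, trimming memory for the inference-only quantization checks. A minimal demonstration of the effect, with a scoped alternative for comparison:

```python
import torch

# Module-wide, as in the diff: subsequent forward passes record no graph.
torch.set_grad_enabled(False)

layer = torch.nn.Linear(4, 4)
out = layer(torch.randn(2, 4))
print(out.requires_grad)  # False

# Scoped equivalent for a single block:
with torch.no_grad():
    out = layer(torch.randn(2, 4))
```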

test/3x/torch/requirements.txt

+1 −1

@@ -1,5 +1,5 @@
 auto_round
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
 expecttest
 intel_extension_for_pytorch
 numpy
