
Commit e87c95f

Kaihui-intel and pre-commit-ci[bot] authored Apr 23, 2024
Fix weight_only algorithms import (#1742)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0ba5732 commit e87c95f

7 files changed, +31 -34 lines changed
 

‎examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt

+1 -1

@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval
 peft

‎examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py

+21 -15

@@ -50,8 +50,7 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag", "winogrande", "piqa", "wikitext"],
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                     type=str, help="tasks list for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
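The --tasks option now takes a single comma-separated string rather than an argparse list, so every consumer splits it before iterating. A minimal standalone sketch of the new pattern, using only the argument definition shown above (the sample parse_args input is hypothetical):

import argparse

parser = argparse.ArgumentParser()
# Comma-separated task string, matching the updated run_clm_no_trainer.py interface.
parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                    type=str, help="tasks list for accuracy validation")
args = parser.parse_args(["--tasks", "lambada_openai,piqa"])

# Downstream loops iterate over the split string instead of a nargs='+' list.
for task_name in args.tasks.split(","):
    print(task_name)  # prints lambada_openai, then piqa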
@@ -390,24 +389,27 @@ def run_fn(model):
 
 if args.accuracy:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        device="cpu",
     )
+    results = evaluate(eval_args)
+
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
         with open(args.save_accuracy_path, "w") as f:
             f.write(dumped)
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
     print('Batch size = %d' % args.batch_size)
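With the newer lm-eval-style harness exposed through intel-extension-for-transformers, evaluation is configured through an LMEvalParser object and launched with a single evaluate(eval_args) call, and metric keys carry a ",none" filter suffix. A condensed sketch of that pattern, assuming the wrapper behaves exactly as the diff above uses it (model_name, user_model, and tokenizer are placeholders):

from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser

def eval_accuracy(user_model, tokenizer, model_name, tasks, batch_size=8):
    """Run the lm-eval harness on an in-memory model and return per-task scores."""
    eval_args = LMEvalParser(
        model="hf",  # generic HF backend replaces the old "hf-causal" name
        model_args="pretrained=" + model_name + ",tokenizer=" + model_name + ",dtype=float32",
        user_model=user_model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        tasks=tasks,
        device="cpu",
    )
    results = evaluate(eval_args)
    scores = {}
    for task_name in tasks.split(","):
        # The harness appends the filter name ("none") to each metric key.
        key = "word_perplexity,none" if task_name == "wikitext" else "acc,none"
        scores[task_name] = results["results"][task_name][key]
    return scores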

@@ -417,21 +419,25 @@ def run_fn(model):
     import time
 
     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
+        device="cpu",
     )
+    start = time.time()
+    results = evaluate(eval_args)
     end = time.time()
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
         print("Accuracy: %.5f" % acc)
     print('Throughput: %.3f samples/sec' % (samples / (end - start)))
     print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
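The benchmark path builds the same LMEvalParser arguments but adds limit=samples, and the timer now starts after the parser object is constructed, so only the evaluate(eval_args) call is measured. A small sketch of the throughput/latency arithmetic (run_eval and samples are placeholders):

import time

def report_perf(run_eval, samples):
    """Time one evaluation pass and derive throughput and per-sample latency."""
    start = time.time()
    run_eval()  # e.g. a callable wrapping evaluate(eval_args) with limit=samples
    end = time.time()
    elapsed = end - start
    print('Throughput: %.3f samples/sec' % (samples / elapsed))
    print('Latency: %.3f ms' % (elapsed * 1000 / samples))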

‎neural_compressor/torch/algorithms/weight_only/__init__.py

-9

@@ -11,12 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .rtn import rtn_quantize
-from .gptq import gptq_quantize
-from .awq import awq_quantize
-from .teq import teq_quantize
-from .autoround import autoround_quantize
-from .hqq import hqq_quantize
-from .modules import WeightOnlyLinear
-from .utility import *
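Because the package __init__ no longer re-exports these symbols, callers import them from the concrete submodules instead; that is the pattern applied throughout the rest of this commit. A short sketch of the updated import style (the paths below are the ones used later in this diff):

# Import directly from the algorithm submodules; the weight_only package no longer re-exports them.
from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
from neural_compressor.torch.algorithms.weight_only.gptq import gptq_quantize
from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING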

‎neural_compressor/torch/algorithms/weight_only/modules.py

+1 -1

@@ -69,7 +69,7 @@ def __init__(
             bits = self.dtype.lstrip("int")
             self.dtype = "int"
         if "int" not in self.dtype:  # for nf4, fp4
-            from neural_compressor.torch.algorithms.weight_only import FLOAT_MAPPING, INT_MAPPING
+            from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING
 
             self.use_optimum_format = False  # optimum_format doesn't suit for symmetric nf4 fp4.
             float_list = FLOAT_MAPPING[self.dtype]
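The mapping constants now come from weight_only.utility, matching where they live after the package re-exports were removed. A hedged sketch of the lookup pattern the module relies on (the actual contents of FLOAT_MAPPING and INT_MAPPING are not shown in this diff; only the keyed-by-dtype access is):

from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING

dtype = "nf4"  # example dtype handled by the non-int branch above
float_list = FLOAT_MAPPING[dtype]  # representable float values for this dtype
int_list = INT_MAPPING[dtype]      # the corresponding integer codes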

‎neural_compressor/torch/quantization/algorithm_entry.py

+6 -6

@@ -40,7 +40,7 @@ def rtn_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs
 ) -> torch.nn.Module:
     """The main entry to apply rtn quantization."""
-    from neural_compressor.torch.algorithms.weight_only import rtn_quantize
+    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
 
     # rebuild weight_config for rtn_quantize function
     weight_config = {}
@@ -75,7 +75,7 @@ def gptq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the GPTQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import gptq_quantize
+    from neural_compressor.torch.algorithms.weight_only.gptq import gptq_quantize
 
     # rebuild weight_config for gptq_quantize function
     weight_config = {}
@@ -228,7 +228,7 @@ def awq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AWQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the AWQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import awq_quantize
+    from neural_compressor.torch.algorithms.weight_only.awq import awq_quantize
 
     weight_config = {}
     for (op_name, op_type), op_config in configs_mapping.items():
@@ -288,7 +288,7 @@ def awq_quantize_entry(
 def teq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], TEQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import teq_quantize
+    from neural_compressor.torch.algorithms.weight_only.teq import teq_quantize
 
     logger.info("Quantize model with the TEQ algorithm.")
     weight_config = {}
@@ -338,7 +338,7 @@ def teq_quantize_entry(
 def autoround_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import autoround_quantize
+    from neural_compressor.torch.algorithms.weight_only.autoround import autoround_quantize
 
     logger.info("Quantize model with the AutoRound algorithm.")
     calib_func = kwargs.get("run_fn", None)
@@ -407,7 +407,7 @@ def autoround_quantize_entry(
 def hqq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, Callable], HQQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import hqq_quantize
+    from neural_compressor.torch.algorithms.weight_only.hqq import hqq_quantize
 
     logger.info("Quantize model with the HQQ algorithm.")
     q_model = hqq_quantize(model, configs_mapping)
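All six entry functions keep the same lazy-import structure; only the import target moves from the weight_only package to the algorithm's own submodule. A schematic sketch of that pattern (the signature is abbreviated and the trailing call is illustrative, not the exact entry body):

import torch

def rtn_entry_sketch(model: torch.nn.Module, configs_mapping, *args, **kwargs) -> torch.nn.Module:
    """Schematic entry point: the algorithm import is deferred until the entry is invoked."""
    # Import from the concrete submodule; the weight_only package no longer re-exports it.
    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize

    weight_config = {}  # rebuilt from configs_mapping in the real entry
    # ... rtn_quantize is then called with the model and the rebuilt weight_config ...
    return model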

‎test/3x/torch/quantization/weight_only/test_gptq.py

+1 -1

@@ -4,7 +4,7 @@
 import torch
 import transformers
 
-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import GPTQConfig, get_default_gptq_config, get_default_rtn_config, quantize
 
 
‎test/3x/torch/quantization/weight_only/test_rtn.py

+1 -1

@@ -4,7 +4,7 @@
 import torch
 import transformers
 
-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import (
     RTNConfig,
     get_default_double_quant_config,
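Both test files now pull WeightOnlyLinear from weight_only.modules. A hedged sketch of how a test can use that class once a model has been quantized (the helper below is illustrative, not part of the test suite):

import torch
from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear

def count_packed_linears(model: torch.nn.Module) -> int:
    """Count modules replaced by WeightOnlyLinear after weight-only quantization."""
    return sum(isinstance(m, WeightOnlyLinear) for m in model.modules())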
