Skip to content

Commit f58216d

Browse files
committed
add support for continuous batching (CB)
1 parent ee5aa1e commit f58216d

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

llm_bench/python/benchmark.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -703,7 +703,9 @@ def get_argprser():
703703
)
704704
parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files')
705705
llm_bench_utils.model_utils.add_stateful_model_arguments(parser)
706-
parser.add_argument("--genai", action="store_true")
706+
parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking")
707+
parser.add_argument("--use_cb", action="store_true", help="Use Continious Batching inference mode")
708+
parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continious batching settings")
707709
parser.add_argument(
708710
'--end_token_stopping',
709711
action='store_true',

llm_bench/python/llm_bench_utils/model_utils.py

+8
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,7 @@ def analyze_args(args):
136136
model_args['subsequent'] = args.subsequent
137137
model_args['output_dir'] = args.output_dir
138138
model_args['genai'] = args.genai
139+
model_args["use_cb"] = args.use_cb
139140
model_args['devices'] = args.device
140141
model_args['prompt_index'] = [] if args.prompt_index is not None else None
141142
if model_args['prompt_index'] is not None:
@@ -164,6 +165,13 @@ def analyze_args(args):
164165
log.info(f"PT Config={model_args['config']}")
165166
model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
166167
model_args['model_name'] = model_name
168+
169+
if args.use_cb and not args.genai:
170+
raise RuntimeError("Continious batching mode supported only via OpenVINO GenAI")
171+
cb_config = None
172+
if args.cb_config:
173+
cb_config = get_config(args.cb_config)
174+
model_args["cb_config"] = cb_config
167175
return model_path, model_framework, model_args, model_name
168176

169177

llm_bench/python/llm_bench_utils/ov_utils.py

+13-2
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ def create_text_gen_model(model_path, device, **kwargs):
144144
raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
145145
else:
146146
if kwargs.get("genai", False) and is_genai_available(log_msg=True):
147-
if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]:
147+
if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]:
148148
log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default bencmarking")
149149
else:
150150
return create_genai_text_gen_model(model_path, device, ov_config, **kwargs)
@@ -185,8 +185,19 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
185185
convert_ov_tokenizer(model_path)
186186

187187
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
188-
start = time.perf_counter()
189188

189+
cb = kwargs.get("use_cb", False)
190+
if cb:
191+
log.info("Continius Batching mode activated")
192+
scheduler_config = openvino_genai.SchedulerConfig()
193+
scheduler_params = kwargs.get("cb_config") or {"cache_size": 1}
194+
if scheduler_params:
195+
log.info(f"Scheduler parameters:\n{scheduler_params}")
196+
197+
for param, value in scheduler_params.items():
198+
setattr(scheduler_config, param, value)
199+
ov_config["scheduler_config"] = scheduler_config
200+
start = time.perf_counter()
190201
llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config)
191202
end = time.perf_counter()
192203
log.info(f'Pipeline initialization time: {end - start:.2f}s')

0 commit comments

Comments
 (0)