@@ -17,6 +17,7 @@ def get_greedy() -> GenerationConfig:
     generation_config.num_return_sequences = 1
     return generation_config
 
+
 def get_beam_search() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_groups = 3
@@ -25,6 +26,7 @@ def get_beam_search() -> GenerationConfig:
     generation_config.num_return_sequences = generation_config.num_groups * generation_config.group_size
     return generation_config
 
+
 def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
     prompts = [
         "What is OpenVINO?",
@@ -40,6 +42,7 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
     ]
     return (prompts, generation_configs)
 
+
 def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
     scheduler_config = SchedulerConfig()
     if scheduler_params is None:
@@ -54,6 +57,7 @@ def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
 
     return scheduler_config
 
+
 def convert_to_hf(
     default_generation_config: HFGenerationConfig,
     generation_config: GenerationConfig
@@ -91,6 +95,7 @@ def convert_to_hf(
     hf_generation_config = HFGenerationConfig(**kwargs)
     return hf_generation_config
 
+
 def run_hugging_face(
     model_id: str,
     prompts: List[str],
@@ -117,7 +122,7 @@ def run_hugging_face(
         inputs = hf_tokenizer(prompt, return_tensors="pt")
         prompt_len = len(inputs['input_ids'][0])
         generate_outputs = model.generate(**inputs, generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
-        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])
+        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
         generation_result = GenerationResult()
         generation_result.m_generation_ids = all_text_batch
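
Aside: the `skip_special_tokens=True` added above keeps the HF reference strings free of end-of-text markers, so they can be compared verbatim with the pipeline's detokenized output. A minimal sketch of the difference (the `gpt2` checkpoint is an illustrative assumption, not something this patch uses):

```python
# Minimal sketch (not part of the patch): decoding with and without
# skip_special_tokens. The gpt2 checkpoint is an illustrative assumption.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok("What is OpenVINO?")["input_ids"] + [tok.eos_token_id]

print(tok.decode(ids))                            # What is OpenVINO?<|endoftext|>
print(tok.decode(ids, skip_special_tokens=True))  # What is OpenVINO?
```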
@@ -126,16 +131,23 @@ def run_hugging_face(
             generation_result.m_scores = [score for score in generate_outputs.sequences_scores]
         generation_results.append(generation_result)
 
+    del hf_tokenizer
+    del model
+
     return (generation_results, model_path)
 
+
 def run_continuous_batching(
     model_path: Path,
     scheduler_config: SchedulerConfig,
     prompts: List[str],
     generation_configs: List[GenerationConfig]
 ) -> List[GenerationResult]:
     pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
-    return pipe.generate(prompts, generation_configs)
+    output = pipe.generate(prompts, generation_configs)
+    del pipe
+    return output
+
 
 def get_models_list(file_name: str):
     models = []
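
The new `del` statements presumably drop the tokenizer, model, and pipeline as soon as their results are collected, so at most one model instance stays resident between parameterized test runs. A minimal sketch of that release-before-return pattern, with a hypothetical `LargeModel` standing in for the heavyweight objects:

```python
import gc

class LargeModel:
    """Hypothetical stand-in for a memory-heavy model or pipeline."""
    def generate(self, prompt: str) -> str:
        return prompt.upper()

def run_model(prompt: str) -> str:
    model = LargeModel()
    output = model.generate(prompt)  # finish all work that needs the model
    del model                        # drop the last reference; refcounting frees it now
    gc.collect()                     # optional: also sweep any reference cycles
    return output

print(run_model("what is openvino?"))  # -> WHAT IS OPENVINO?
```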
@@ -148,6 +160,7 @@ def get_models_list(file_name: str):
         models.append(model_name)
     return models
 
+
 def compare_results(hf_result, ov_result, generation_config):
     if generation_config.is_beam_search:
         assert len(hf_result.m_scores) == len(ov_result.m_scores)
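
For context, these helpers are presumably composed in a test along the lines of the sketch below. This is hypothetical glue code, not part of the patch; `run_hugging_face` takes additional arguments this diff does not show, and the model id is an illustrative placeholder:

```python
# Hypothetical composition of the helpers above (not from this patch).
prompts, generation_configs = get_test_dataset()
scheduler_config = get_scheduler_config()

# run_hugging_face takes further arguments not visible in this diff.
hf_results, model_path = run_hugging_face("facebook/opt-125m", prompts, generation_configs)
ov_results = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs)

for hf_result, ov_result, generation_config in zip(hf_results, ov_results, generation_configs):
    compare_results(hf_result, ov_result, generation_config)
```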