
Commit 2ed9889

vshampor and l-bat authored
Token eviction (openvinotoolkit#757)
Co-authored-by: Liubov Talamanova <liubov.talamanova@intel.com>
1 parent c58ba64 commit 2ed9889

40 files changed, +3475 −499 lines changed

.github/workflows/linux.yml

+1 −1

@@ -189,7 +189,7 @@ jobs:
     if: |
       always() &&
       (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success')
-    timeout-minutes: 90
+    timeout-minutes: 120
     defaults:
       run:
         shell: bash
@@ -0,0 +1,96 @@
from pathlib import PosixPath
import os
import tempfile

import whowhatbench
from whowhatbench.wwb import load_dataset
from optimum.intel.openvino import OVModelForCausalLM

from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode

from openvino_tokenizers import convert_tokenizer
from openvino import serialize
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MAX_NEW_TOKENS = 128
SEQS_PER_REQUEST = 5
MAX_SEQUENCES = 100


# Export the HF model to OpenVINO IR and store the converted tokenizer/detokenizer next to it.
model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_path = PosixPath(tempfile.gettempdir()) / model_id
model.save_pretrained(model_path)

ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True)
serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml")
serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml")

# Baseline scheduler configuration: no cache eviction.
scheduler_config_noopt = SchedulerConfig()
scheduler_config_noopt.num_kv_blocks = 300
scheduler_config_noopt.dynamic_split_fuse = True
scheduler_config_noopt.max_num_batched_tokens = 256
scheduler_config_noopt.max_num_seqs = 256
scheduler_config_noopt.enable_prefix_caching = False

# Optimized scheduler configuration: cache eviction enabled
# (start area 32 tokens, recent area 32 tokens, max 128 tokens per sequence, NORM_SUM aggregation).
scheduler_config_opt = SchedulerConfig()
scheduler_config_opt.num_kv_blocks = 300
scheduler_config_opt.dynamic_split_fuse = True
scheduler_config_opt.max_num_batched_tokens = 256
scheduler_config_opt.max_num_seqs = 256
scheduler_config_opt.use_cache_eviction = True
scheduler_config_opt.enable_prefix_caching = False
eviction_config = CacheEvictionConfig(32, 32, 128, AggregationMode.NORM_SUM)
scheduler_config_opt.cache_eviction_config = eviction_config

generation_config = GenerationConfig()
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = MAX_NEW_TOKENS

# Deduplicate the SQuAD contexts (a dict preserves insertion order) and keep the first MAX_SEQUENCES of them.
data = load_dataset(path='squad', name=None, split='validation')["context"]
data_dict = {"questions": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]}

model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {})
model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {})


GT_DATA_FILE = 'gt_data.csv'

# Reuse previously dumped ground-truth answers if available; otherwise generate them
# with the non-eviction pipeline and cache them for later runs.
if os.path.exists(GT_DATA_FILE):
    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, gt_data=GT_DATA_FILE, tokenizer=tokenizer,
                                       test_data=data_dict, generation_config=generation_config,
                                       max_new_tokens=MAX_NEW_TOKENS, seqs_per_request=3)
else:
    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
                                       generation_config=generation_config, max_new_tokens=MAX_NEW_TOKENS,
                                       seqs_per_request=3)
    evaluator.dump_gt(GT_DATA_FILE)


all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt)


print(all_metrics_per_question)
print(all_metrics)

metrics = ["similarity", "SDT norm"]

# Show the prompts where the eviction-enabled pipeline diverges most from the baseline.
for metric in metrics:
    worst_examples = evaluator.worst_examples(top_k=5, metric=metric)
    print("Metric: ", metric)
    for e in worst_examples:
        print("\t=========================")
        print(f"\t{metric}: ", e[metric])
        print("\tPrompt: ", e["prompt"])
        print("\tSource Model:\n ", "\t" + e["source_model"])
        print("\tOptimized Model:\n ", "\t" + e["optimized_model"])

# Compare KV cache usage between the two pipelines.
pipeline_opt_metrics = model_cb_opt.get_metrics()
pipeline_noopt_metrics = model_cb_noopt.get_metrics()

print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
@@ -1,10 +1,9 @@
 transformers>=4.35.2
 sentence-transformers>=2.2.2
 openvino>=2024.3.0
-openvino-telemetry>=2024.3.0
+openvino-telemetry
 optimum-intel>=1.14
-openvino-tokenizers>=2024.3.0
-openvino-genai>=2024.3.0
+openvino-tokenizers
 pandas>=2.0.3
 numpy>=1.23.5
 tqdm>=4.66.1

llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py

+29 −5
@@ -81,6 +81,8 @@ def autodetect_language(model):
         "internlm": "cn",
     }
 
+    if not hasattr(model, "config"):
+        return "en"
     return model2language.get(model.config.model_type, "en")
 
 
@@ -98,6 +100,9 @@ def __init__(
         num_samples=None,
         language=None,
         gen_answer_fn=None,
+        generation_config=None,
+        generation_config_base=None,
+        seqs_per_request=None
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -109,6 +114,11 @@
         self.tokenizer = tokenizer
         self._crop_question = crop_question
         self.num_samples = num_samples
+        self.generation_config = generation_config
+        self.generation_config_base = generation_config
+        self.seqs_per_request = seqs_per_request
+        if self.generation_config is not None:
+            assert self.seqs_per_request is not None
 
         # Take language from the base model if provided
         self.language = language
@@ -117,7 +127,7 @@
             self.language = autodetect_language(base_model)
 
         if base_model:
-            self.gt_data = self._generate_data(base_model, gen_answer_fn)
+            self.gt_data = self._generate_data(base_model, gen_answer_fn, generation_config=generation_config)
         else:
             self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
 
@@ -139,7 +149,7 @@ def dump_gt(self, csv_name: str):
         self.gt_data.to_csv(csv_name)
 
     def score(self, model, gen_answer_fn=None):
-        predictions = self._generate_data(model, gen_answer_fn)
+        predictions = self._generate_data(model, gen_answer_fn, self.generation_config)
 
         all_metrics_per_question = {}
         all_metrics = {}
@@ -179,9 +189,10 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
         return res
 
-    def _generate_data(self, model, gen_answer_fn=None):
+    def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question):
             inputs = self.tokenizer(question, return_tensors="pt")
+
             tokens = model.generate(**inputs, max_new_tokens=max_new_tokens)
             out = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
             return out[len(question) :] if crop_question else out
@@ -209,8 +220,21 @@ def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question
         answers = []
         prompts = questions.values if self.num_samples is None else questions.values[:self.num_samples]
 
-        for q in tqdm(prompts, desc="Evaluate pipeline"):
-            answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question))
+        if generation_config is None:
+            for q in tqdm(prompts, desc="Evaluate pipeline"):
+                answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question))
+        else:
+            with tqdm(total=len(questions.values)) as progress_bar:
+                batch = []
+                for q_idx, q in enumerate(questions.values):
+                    progress_bar.update(1)
+                    batch.append(q)
+                    if len(batch) == self.seqs_per_request or q_idx == len(questions.values) - 1:
+                        ans_batch = model.generate(batch, [generation_config] * len(batch))
+                        for ans in ans_batch:
+                            answers.append(ans.m_generation_ids[0])
+
+                        batch.clear()
 
         res_data = {"questions": list(prompts), "answers": answers}
         df = pd.DataFrame(res_data)
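
The generation_config branch added to _generate_data accumulates prompts into batches of seqs_per_request and submits each batch to the pipeline's generate() call with one config per prompt. A standalone sketch of the same chunking pattern, with a stand-in generate_fn instead of the real ContinuousBatchingPipeline.generate:

# Minimal sketch of the batched-generation loop above; `generate_fn` stands in
# for ContinuousBatchingPipeline.generate(prompts, configs).
def batched_answers(prompts, seqs_per_request, generation_config, generate_fn):
    answers = []
    batch = []
    for idx, prompt in enumerate(prompts):
        batch.append(prompt)
        if len(batch) == seqs_per_request or idx == len(prompts) - 1:
            for result in generate_fn(batch, [generation_config] * len(batch)):
                answers.append(result)  # real results expose m_generation_ids[0]
            batch.clear()
    return answers

# Example with a stand-in generate function:
print(batched_answers(["a", "b", "c", "d", "e"], 2, None,
                      lambda prompts, cfgs: [p.upper() for p in prompts]))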

samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp

+8 −1
@@ -14,6 +14,7 @@
 #include <nlohmann/json.hpp>
 #include <cxxopts.hpp>
 
+#include "openvino/genai/cache_eviction.hpp"
 #include "openvino/genai/tokenizer.hpp"
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_handle.hpp"
@@ -440,6 +441,7 @@ int main(int argc, char* argv[]) try {
     ("cache_size", "Size of memory used for KV cache in GB. Default: 16", cxxopts::value<size_t>()->default_value("16"))
     ("device", "Target device to run the model. Default: CPU", cxxopts::value<std::string>()->default_value("CPU"))
     ("device_config", "Plugin configuration JSON. Example: '{\"MODEL_DISTRIBUTION_POLICY\":\"TENSOR_PARALLEL\",\"PERF_COUNT\":true}' Default: {\"PERF_COUNT\":true}", cxxopts::value<std::string>()->default_value("{\"PERF_COUNT\":true}"))
+    ("use_cache_eviction", "Whether to use cache eviction", cxxopts::value<bool>()->default_value("false"))
     ("h,help", "Print usage");
 
     cxxopts::ParseResult result;
@@ -467,6 +469,7 @@ int main(int argc, char* argv[]) try {
     const std::string device = result["device"].as<std::string>();
     const std::string device_config = result["device_config"].as<std::string>();
     const size_t cache_size = result["cache_size"].as<size_t>();
+    const bool use_cache_eviction = result["use_cache_eviction"].as<bool>();
 
     // Create requests for generation
     Dataset dataset = filtered_dataset(models_path, dataset_path, num_prompts, max_input_len, max_output_len);
@@ -486,7 +489,11 @@
     scheduler_config.cache_size = cache_size,
     scheduler_config.block_size = get_default_block_size(device),
     scheduler_config.dynamic_split_fuse = dynamic_split_fuse,
-    scheduler_config.max_num_seqs = 256, // not used if dynamic_split_fuse=True
+    scheduler_config.max_num_seqs = 256; // not used if dynamic_split_fuse=True
+    if (use_cache_eviction) {
+        scheduler_config.use_cache_eviction = true;
+        scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(32, 32, 128, ov::genai::AggregationMode::NORM_SUM);
+    }
 
     std::cout << "Benchmarking parameters: " << std::endl;
     std::cout << "\tMax number of batched tokens: " << scheduler_config.max_num_batched_tokens << std::endl;
@@ -0,0 +1,83 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <cstddef>
#include "openvino/openvino.hpp"

namespace ov::genai {
/**
 * @brief Represents the mode of per-token score aggregation when determining least important tokens for eviction
 * from cache
 */
enum class AggregationMode {
    SUM,     /**< In this mode the importance scores of each token will be summed after each step of generation */
    NORM_SUM /**< Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated)
              * of a given token in cache */
};

/**
 * @brief Configuration struct for the cache eviction algorithm.
 */
class CacheEvictionConfig {
public:
    CacheEvictionConfig() {};
    CacheEvictionConfig(size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode_) : aggregation_mode(aggregation_mode_), m_start_size(start_size), m_recent_size(recent_size), m_max_cache_size(max_cache_size) {
        OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero");
        OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero");
        OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero");

        OPENVINO_ASSERT(max_cache_size > (start_size + recent_size),
                        "CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size");
        m_evictable_size = m_max_cache_size - m_start_size - m_recent_size;
    }

    /** @return Number of tokens in the *beginning* of KV cache that will be retained for the sequence. */
    std::size_t get_start_size() const {
        return m_start_size;
    }

    /** @return Number of tokens in the *end* of KV cache that will be retained for the sequence. */
    std::size_t get_recent_size() const {
        return m_recent_size;
    }

    /** @return Maximum cache size (in tokens) that a sequence with cache eviction enabled is allowed to occupy. */
    std::size_t get_max_cache_size() const {
        return m_max_cache_size;
    }

    /** @return Number of tokens between the "start" and "recent" areas of KV cache that
     * will be considered for eviction. */
    std::size_t get_evictable_size() const {
        return m_evictable_size;
    }

    /** The mode used to compute the importance of tokens for eviction */
    AggregationMode aggregation_mode = AggregationMode::NORM_SUM;

private:
    /** Number of tokens in the *beginning* of KV cache that should be retained
     * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for
     * this pipeline.*/
    std::size_t m_start_size = 32;

    /** Number of tokens in the *end* of KV cache that should be retained
     * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for
     * this pipeline.*/
    std::size_t m_recent_size = 128;

    /**
     * @brief Maximum cache size (in tokens) that can be occupied by a sequence with cache eviction enabled.
     * Actual occupied size may differ from this by no larger than (block_size) tokens.
     * Eviction area is computed from this size and the "start"/"recent" area sizes.
     * @return Total cache size (in tokens) allowed to be occupied by a sequence.
     */
    std::size_t m_max_cache_size = 672;

    std::size_t m_evictable_size = 512;
};
}
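
The two aggregation modes differ only in normalization: SUM accumulates each cached token's importance score across generation steps, while NORM_SUM additionally divides the total by how many generated tokens the cached token has lived through. A rough, illustrative Python sketch of that scoring rule (not the pipeline's actual eviction code):

# Illustrative scoring only; the real eviction logic lives inside the C++ pipeline.
def aggregate_scores(per_step_scores, lifetimes, normalize):
    """per_step_scores[t][i]: importance of cached token i at generation step t;
    lifetimes[i]: number of generated tokens that cached token i has lived through."""
    totals = [sum(step[i] for step in per_step_scores) for i in range(len(lifetimes))]
    if normalize:  # NORM_SUM: favor tokens that score well relative to their age
        totals = [s / max(life, 1) for s, life in zip(totals, lifetimes)]
    return totals  # lowest-scoring tokens in the evictable area are evicted first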

src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp

+33 −5
@@ -13,15 +13,39 @@
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/visibility.hpp"
+#include "cache_eviction.hpp"
 
 namespace ov::genai {
-struct PipelineMetrics {
-    // All requests as viewed by the pipeline
+
+/**
+ * @brief Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline
+ * or measured at the previous generation step.
+ */
+struct PipelineMetrics {
+    /**
+     * Number of requests to be processed by the pipeline.
+     */
     size_t requests = 0;
-    // Requests scheduled for processing
+
+    /**
+     * Number of requests that were scheduled for processing at the previous step of the pipeline.
+     */
     size_t scheduled_requests = 0;
-    // Percentage of KV cache usage
+
+    /**
+     * Percentage of KV cache usage in the last generation step.
+     */
     float cache_usage = 0.0;
+
+    /**
+     * Max KV cache usage during the lifetime of the pipeline in %
+     */
+    float max_cache_usage = 0.0;
+
+    /**
+     * Running average of the KV cache usage during the lifetime of the pipeline, with max window size of 1000 steps
+     */
+    float avg_cache_usage = 0.0;
 };
 
 class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
@@ -57,7 +81,11 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
 
     ov::genai::GenerationConfig get_config() const;
 
-    PipelineMetrics get_metrics() const;
+    /**
+     * Allows to get the current pipeline metrics.
+     * @return The struct with pipeline metrics for the previous generation step.
+     */
+    ov::genai::PipelineMetrics get_metrics() const;
 
     GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params);
     GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params);
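
The new avg_cache_usage field is documented as a running average with a window of at most 1000 steps. A minimal sketch of that kind of windowed average, purely illustrative of the documented behaviour rather than the pipeline's internal implementation:

# Illustrative windowed running average matching the documented "max window
# size of 1000 steps" in spirit; not the pipeline's internal code.
from collections import deque

class RunningAverage:
    def __init__(self, max_window: int = 1000):
        self._window = deque(maxlen=max_window)  # oldest samples fall out automatically

    def add(self, value: float) -> None:
        self._window.append(value)

    @property
    def value(self) -> float:
        return sum(self._window) / len(self._window) if self._window else 0.0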
