Commit b8a84b8

Demo for chunk streaming (openvinotoolkit#1320)
Add python chat example for chunk streaming
1 parent 8a74d24 commit b8a84b8

File tree

2 files changed: +24 -3 lines changed

samples/python/multinomial_causal_lm/README.md

+2
@@ -32,6 +32,8 @@ This Python example demonstrates custom detokenization with bufferization. The s

To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed.

+At the same time, to optimize performance in streaming mode, we provide chunk streaming. Chunk streaming noticeably improves the streamed token rate for very small LLMs: instead of running the sampling step after every generated token, it samples once after several tokens have been generated. The tokens_len parameter controls how many tokens accumulate in the tokens_cache before sampling.
+
### Troubleshooting

#### Unicode characters encoding error on Windows
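
The two paragraphs above describe the buffering and the synchronized queue only in prose. Below is a minimal, standalone sketch of the same pattern using only the Python standard library; it is not the sample's code (the real ChunkStreamer follows in multinomial_causal_lm.py), and the ToyChunkStreamer class, the fake `<tokN>` decoding, and the simulated token ids are illustrative stand-ins.

```python
import queue
import threading


class ToyChunkStreamer:
    """Buffers token ids and emits 'decoded' text once per tokens_len tokens."""

    def __init__(self, tokens_len):
        self.tokens_len = tokens_len
        self.tokens_cache = []           # accumulated, not-yet-decoded token ids
        self.text_queue = queue.Queue()  # thread-safe hand-off to the printer thread

    def _flush(self):
        if self.tokens_cache:
            # Stand-in for detokenization: join placeholder strings.
            self.text_queue.put(' '.join(f'<tok{t}>' for t in self.tokens_cache))
            self.tokens_cache = []

    def put(self, token_id):
        self.tokens_cache.append(token_id)
        if len(self.tokens_cache) % self.tokens_len == 0:
            self._flush()

    def end(self):
        self._flush()
        self.text_queue.put(None)  # sentinel: generation is finished


def printer(streamer):
    # get() blocks until a chunk (or the end sentinel) is available.
    while (chunk := streamer.text_queue.get()) is not None:
        print(chunk, flush=True)


streamer = ToyChunkStreamer(tokens_len=4)
thread = threading.Thread(target=printer, args=(streamer,), daemon=True)
thread.start()

for token_id in range(10):  # pretend these ids come from the generation loop
    streamer.put(token_id)
streamer.end()
thread.join()
```

The real sample replaces the fake decode with the tokenizer's detokenization and feeds put() from the generation pipeline, but the queue hand-off to a separate printer thread works the same way.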

samples/python/multinomial_causal_lm/multinomial_causal_lm.py

+22 -3
@@ -120,23 +120,41 @@ def end(self):
        self.put_word(None)


+class ChunkStreamer(IterableStreamer):
+
+    def __init__(self, tokenizer, tokens_len):
+        super().__init__(tokenizer)
+        self.tokens_len = tokens_len
+
+    def put(self, token_id: int) -> bool:
+        if (len(self.tokens_cache) + 1) % self.tokens_len != 0:
+            self.tokens_cache.append(token_id)
+            return False
+        return super().put(token_id)
+
+
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_dir')
    parser.add_argument('prompt')
    args = parser.parse_args()

    device = 'CPU' # GPU can be used as well
+    tokens_len = 10 # chunk size
    pipe = openvino_genai.LLMPipeline(args.model_dir, device)
-
-    text_print_streamer = IterableStreamer(pipe.get_tokenizer())
+
+    text_print_streamer = ChunkStreamer(
+        pipe.get_tokenizer(),
+        tokens_len
+    )
+
    def token_printer():
        # Getting next elements from iterable will be blocked until a new token is available.
        for word in text_print_streamer:
            print(word, end='', flush=True)
    printer_thread = threading.Thread(target=token_printer, daemon=True)
    printer_thread.start()
-
+
    config = openvino_genai.GenerationConfig()
    config.max_new_tokens = 100
    config.do_sample = True

@@ -148,5 +166,6 @@ def token_printer():
    pipe.generate(args.prompt, config, text_print_streamer)
    printer_thread.join()

+
if '__main__' == __name__:
    main()
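
To see how often the chunk condition in ChunkStreamer.put() actually delegates to the base class, here is a small, self-contained trace. StubBaseStreamer, DemoChunkStreamer, and decode_calls are names invented for this sketch; the stub also assumes, as a simplification, that the base class flushes its cache on every call, which the real IterableStreamer does not necessarily do.

```python
# Illustrative trace of how often ChunkStreamer.put() delegates to its base
# class. StubBaseStreamer is a stand-in: the real IterableStreamer detokenizes
# tokens_cache with the tokenizer; the stub only counts those calls.
class StubBaseStreamer:
    def __init__(self):
        self.tokens_cache = []
        self.decode_calls = 0

    def put(self, token_id) -> bool:
        self.tokens_cache.append(token_id)
        self.decode_calls += 1      # the real code would decode tokens_cache here
        self.tokens_cache = []      # simplification: assume a full flush each time
        return False                # the sample's streamers return False to keep generating


class DemoChunkStreamer(StubBaseStreamer):
    """Same put() logic as the ChunkStreamer added in this commit."""

    def __init__(self, tokens_len):
        super().__init__()
        self.tokens_len = tokens_len

    def put(self, token_id) -> bool:
        if (len(self.tokens_cache) + 1) % self.tokens_len != 0:
            self.tokens_cache.append(token_id)
            return False
        return super().put(token_id)


streamer = DemoChunkStreamer(tokens_len=10)
for token_id in range(100):           # pretend 100 tokens come from generation
    streamer.put(token_id)
print(streamer.decode_calls)          # prints 10: one decode per 10 tokens
```

With the hard-coded tokens_len = 10 from main(), detokenization work is batched roughly once per ten generated tokens instead of once per token, which is the effect the README paragraph above describes. The sample itself is still invoked as before, e.g. `python multinomial_causal_lm.py model_dir "prompt"`.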
