
Commit 34ed84d

Merge branch 'generate_pipeline' into archive
2 parents: 8976cad + e7fa974

4 files changed: +32 -13 lines

.github/workflows/causal_lm_cpp.yml (+11 -11)

The reference-generation snippets no longer append '\n' to each decoded reference, so predictions.find(ref) no longer requires the match to end at a line boundary, and the speculative-decoding and prompt-lookup binaries are invoked from their new ./build/text_generation/causal_lm/cpp/ location.
@@ -74,7 +74,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -90,7 +90,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('69', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -106,7 +106,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('Hi', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -122,7 +122,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('return 0', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -138,7 +138,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -160,7 +160,7 @@ jobs:
 for prompt in prompts:
     tokenized = tokenizer(prompt, return_tensors='pt')
     for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-        ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+        ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
         idx = predictions.find(ref)
         if -1 == idx:
             raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -201,7 +201,7 @@ jobs:
 echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py
 echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
 echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py
-echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py
+echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
 echo idx = predictions.find(ref) >> ref.py
 echo if -1 == idx: >> ref.py
 echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
@@ -347,7 +347,7 @@ jobs:
 - name: run and compare
   run: |
     source ./ov/setupvars.sh
-    ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt
+    ./build/text_generation/causal_lm/cpp/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt
     ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt
     python -c "
     with open('predictions_greedy.txt', 'r') as f:
@@ -393,7 +393,7 @@ jobs:
 Question: Can you please add 2 and 3
 A:' > ./prompt.txt

-./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
+./build/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
 ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
 python -c "
 with open('predictions_greedy.txt', 'r') as f:
@@ -441,7 +441,7 @@ jobs:
 tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
 tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
 for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
-    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -486,7 +486,7 @@ jobs:
 tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
 tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
 for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
-    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref}" from predictions')

src/cpp/src/generation_config.cpp (+6 -1)

eos_token_id is now read from the config JSON only when it is a single integer, since models such as Qwen store several eos_token_id values that a plain assignment would mishandle.
@@ -36,7 +36,12 @@ GenerationConfig::GenerationConfig(std::string json_path) {
     if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"];
     if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"];
     if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"];
-    if (data.contains("eos_token_id")) eos_token_id = data["eos_token_id"];
+
+    if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) {
+        // TODO: Qwen's config contains several eos_token_id values
+        eos_token_id = data["eos_token_id"];
+    }
+
     if (data.contains("bos_token")) bos_token = data["bos_token"];
     if (data.contains("eos_token")) eos_token = data["eos_token"];
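The guard above covers only the single-integer case and skips array-valued configs like Qwen's. A minimal sketch of how both shapes could be read with nlohmann::json; parse_eos_token_ids is a hypothetical helper, not part of this commit:

    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    // Sketch only: read eos_token_id whether the config stores one id or several.
    std::vector<int64_t> parse_eos_token_ids(const nlohmann::json& data) {
        std::vector<int64_t> ids;
        if (!data.contains("eos_token_id"))
            return ids;
        const nlohmann::json& eos = data["eos_token_id"];
        if (eos.is_number_integer()) {
            ids.push_back(eos.get<int64_t>());   // the case this commit handles
        } else if (eos.is_array()) {
            for (const auto& id : eos)           // Qwen-style: several eos ids
                ids.push_back(id.get<int64_t>());
        }
        return ids;
    }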

src/cpp/src/tokenizer.cpp (+8 -1)

Fixes a misindented closing brace and, after left padding, rewrites attention-mask values of '2' to '0' to work around openvino_tokenizers filling the mask with '2' instead of '0'.
@@ -80,7 +80,7 @@ class Tokenizer::TokenizerImpl {
         m_bos_token_id = rt_info["bos_token_id"].as<int64_t>();
         if (rt_info.count("pad_token_id") > 0)
             m_pad_token_id = rt_info["pad_token_id"].as<int64_t>();
-        }
+    }

     std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt) {
         size_t batch_size = 1;
@@ -94,6 +94,13 @@ class Tokenizer::TokenizerImpl {
         auto size_ = m_tokenize_request.get_input_tensor().get_shape();
         m_tokenize_request.infer();
         pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
+
+        // TODO: fix the mask being filled with '2' instead of '0'
+        // https://github.com/openvinotoolkit/openvino_tokenizers/pull/90 should have fixed this
+        ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
+        int64_t* attention_mask_data = attention_mask.data<int64_t>();
+        std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
+
         return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
     }
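For context, the appended workaround rewrites every '2' in the freshly padded attention mask to '0', in place. The same idea on a bare ov::Tensor, as a self-contained sketch (the shape and initial values are illustrative, not from this commit):

    #include <algorithm>
    #include <cstdint>
    #include <openvino/runtime/tensor.hpp>

    // Sketch: scrub stray '2' values out of a left-padded i64 attention mask.
    ov::Tensor make_clean_mask() {
        ov::Tensor mask{ov::element::i64, ov::Shape{1, 4}};
        int64_t* data = mask.data<int64_t>();
        data[0] = 2; data[1] = 2; data[2] = 1; data[3] = 1;  // '2' marks the padded prefix
        std::replace(data, data + mask.get_size(), int64_t{2}, int64_t{0});
        // mask now reads {0, 0, 1, 1}: padding masked out, real tokens kept
        return mask;
    }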

text_generation/causal_lm/cpp/beam_search_causal_lm.cpp (+7 -0)

Hardcodes pad token id 2 in the sample as a workaround until pad_token_id is written into the tokenizer IR.
@@ -3,6 +3,10 @@

 #include <openvino_genai/llm_pipeline.hpp>

+namespace {
+enum SPECIAL_TOKEN { PAD_TOKEN = 2 };
+}
+
 int main(int argc, char* argv[]) try {
     if (argc < 3) {
         throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT 1>' ['<PROMPT 2>' ...]");
@@ -19,6 +23,9 @@ int main(int argc, char* argv[]) try {
     config.num_beams = 15;
     config.num_return_sequences = config.num_beams * prompts.size();

+    // workaround until pad_token_id is written into the IR
+    pipe.get_tokenizer().set_pad_token_id(PAD_TOKEN);
+
     auto beams = pipe.generate(prompts, config);
     for (int i = 0; i < beams.scores.size(); i++)
         std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n';
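With the sample now hardcoding PAD_TOKEN, a run would look like the following, assuming the build layout the updated workflow uses (model directory and prompts are illustrative):

    ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 'Why is the Sun yellow?' '69'

Each beam prints as "score: text", which is the ": "-prefixed form the workflow's reference strings are built to match via predictions.find(ref).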
