Commit 717f311

Align length_penalty

1 parent d6d4f00 commit 717f311

4 files changed: +146 -28 lines changed

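For context: the commit aligns the sample's beam-search scoring with Hugging Face transformers, which in recent releases normalizes a finished hypothesis by the number of generated tokens (the prompt excluded) raised to length_penalty. A minimal sketch of that normalization, as a reference illustration rather than code from this commit:

    #include <cmath>
    #include <cstddef>

    // Reference normalization, assuming transformers' semantics: divide the
    // summed log-probabilities by generated_length ** length_penalty.
    float normalized_score(float sum_logprobs, size_t generated_length, float length_penalty) {
        return sum_logprobs / std::pow(static_cast<float>(generated_length), length_penalty);
    }

With length_penalty=1.0, the value the new CI checks pass to generate(), this reduces to averaging log-probabilities per generated token.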

.github/workflows/causal_lm_cpp.yml (+33 -1)
@@ -6,7 +6,7 @@ on:
       - llm_bench/python/**
       - text_generation/causal_lm/cpp/*
       - thirdparty/openvino_tokenizers
-      - '!**.md'
+      - "!**.md"
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
@@ -94,6 +94,38 @@ jobs:
               predictions = predictions[:idx] + predictions[idx + len(ref):]
           "
           echo Hi passed
+
+          timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "return 0" > ./pred.txt
+          python -c "
+          import transformers
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
+          tokenized = tokenizer('return 0', return_tensors='pt')
+          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing "{ref=}" from predictions')
+              predictions = predictions[:idx] + predictions[idx + len(ref):]
+          "
+          echo return 0 passed
+
+          ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "你好! 你好嗎?" > ./pred.txt
+          python -c "
+          import transformers
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
+          tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
+          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing "{ref=}" from predictions')
+              predictions = predictions[:idx] + predictions[idx + len(ref):]
+          "
+          echo 你好! 你好嗎? passed
   cpp-beam_search_causal_lm-windows:
     runs-on: windows-latest
     steps:
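Each added check pipes the sample's stdout to pred.txt, re-generates the same prompt with transformers under matching parameters, and requires every reference continuation to occur verbatim in the predictions; matched spans are excised from the buffer, so a beam the reference produces twice must also appear twice in the sample's output. A C++ rendering of that find-and-excise step, as a hypothetical mirror of the Python above:

    #include <stdexcept>
    #include <string>

    // Erase one occurrence of `ref` from `predictions`, failing if absent,
    // like the `predictions.find` / slice-out logic in the workflow's Python.
    void expect_and_erase(std::string& predictions, const std::string& ref) {
        size_t idx = predictions.find(ref);
        if (idx == std::string::npos) {
            throw std::runtime_error("Missing reference beam: " + ref);
        }
        predictions.erase(idx, ref.size());
    }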

.gitignore (+41)
@@ -0,0 +1,41 @@
+# build/artifact dirs
+_*
+[Bb]uild*/
+cmake-build*
+
+# but ensure we don't skip __init__.py and __main__.py
+!__init__.py
+!__main__.py
+
+# developer tools
+*.idea
+.vscode
+.vs/
+.vsconan/
+.DS_Store
+**/tags
+compile_commands.json
+bin/
+.local_vimrc
+.gdb_history
+.vimspector.json
+doc/
+docs/build_documentation/work_dir/
+temp/
+.repo/
+CMakeLists.txt.user
+docs/IE_PLUGIN_DG/html/
+CMakeUserPresets.json
+
+*.project
+*.cproject
+*.pydevproject
+*.settings
+*/gen/
+*.swp
+/config.xml
+
+# Python-specific
+*.?env*
+*.pyc
+__pycache__
.clang-format (+28)
@@ -0,0 +1,28 @@
+BasedOnStyle: Google
+IndentWidth: 4
+UseTab: Never
+ColumnLimit: 120
+
+Language: Cpp
+Standard: Cpp11
+
+AccessModifierOffset: -4
+AlignConsecutiveMacros: true
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Empty
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: false
+BinPackArguments: false
+BinPackParameters: false
+CommentPragmas: '^#'
+DerivePointerAlignment: false
+FixNamespaceComments: true
+IndentCaseLabels: false
+IndentPPDirectives: AfterHash
+ForEachMacros:
+  - foreach
+  - FOREACH_CHILD

text_generation/causal_lm/cpp/group_beam_searcher.hpp (+44 -27)
@@ -44,7 +44,10 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
     return res;
 }
 
-struct Token {float log_prob; int64_t idx;};
+struct Token {
+    float log_prob;
+    int64_t idx;
+};
 
 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     if (logits.get_shape().at(0) <= batch_idx) {
@@ -55,10 +58,10 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
     const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
     float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(std::accumulate(
-        beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+    float log_sum = std::log(
+        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
             return accumulated + std::exp(to_add - max_logit);
-    }));
+        }));
     std::vector<Token> tokens;
     tokens.reserve(vocab_size);
     for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -77,7 +80,7 @@ bool greater(const Beam& left, const Beam& right) {
     return left.score > right.score;
 }
 
-enum class StopCriteria {early, heuristic, never};
+enum class StopCriteria { early, heuristic, never };
 
 struct Parameters {
     std::vector<int64_t> prompt;
@@ -90,14 +93,24 @@ struct Parameters {
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
     // There's no way to extract special token values from the tokenizer for now
     int64_t eos_token = 2;
-    std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
+    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
+        return false;
+    };
 };
 
 struct Group {
-    std::vector<Beam> ongoing; // Best beams in front
+    std::vector<Beam> ongoing;  // Best beams in front
     std::vector<Beam> min_heap; // The worst of the best completed beams is the first
     bool done = false;
-    void finish(Beam&& beam, const Parameters& parameters) {
+
+    // finalize parameter introduced to match huggingface implementation
+    void finish(Beam&& beam, const Parameters& parameters, const bool finalize = false) {
+        size_t cur_len = ongoing.front().tokens.size();
+
+        if (!finalize) {
+            cur_len += 1;
+        }
+
         beam.score /= std::pow(float(parameters.prompt.size() + beam.tokens.size()), parameters.length_penalty);
         min_heap.push_back(std::move(beam));
         std::push_heap(min_heap.begin(), min_heap.end(), greater);
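The new finalize flag reflects a distinction inherited from the Hugging Face implementation: a beam finished during the search loop is scored while the current step's token is still being appended, so its effective length is one greater than tokens.size(), whereas beams force-finished in finalize() already hold every generated token. A tiny sketch of that length bookkeeping, with hypothetical names:

    #include <cstddef>

    // Hypothetical helper mirroring the cur_len logic above: `generated` is
    // the number of tokens the leading beam holds when finish() is called.
    size_t effective_length(size_t generated, bool finalize) {
        // During the search the current step's token is not yet counted in
        // tokens.size(), so add one; at finalize() nothing more is appended.
        return finalize ? generated : generated + 1;
    }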
@@ -110,30 +123,34 @@
         if (min_heap.size() < parameters.group_size) {
             return;
         }
-        size_t cur_len = parameters.prompt.size() + ongoing.front().tokens.size();
+        size_t cur_len = ongoing.front().tokens.size() + 1;
         float best_sum_logprobs = ongoing.front().score;
         float worst_score = min_heap.front().score;
         switch (parameters.stop_criteria) {
-            case StopCriteria::early:
-                done = true;
-                return;
-            case StopCriteria::heuristic: {
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            case StopCriteria::never: {
-                size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            default: throw std::runtime_error("Never reached");
+        case StopCriteria::early:
+            done = true;
+            return;
+        case StopCriteria::heuristic: {
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        case StopCriteria::never: {
+            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        default:
+            throw std::runtime_error("Never reached");
         }
     }
 };
 
-struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};
+struct TokenToBeam {
+    int64_t token_idx;
+    int32_t beam_idx;
+};
 
 // GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search
 // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
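For reference, the three StopCriteria values mirror transformers' early_stopping settings (True, False, and "never"): early marks the group done as soon as it holds group_size finished beams; heuristic stops once the best ongoing sum of log-probabilities, normalized by the current length, can no longer beat the worst kept beam; never uses the true bound, taking max_new_tokens as the length whenever length_penalty > 0, since a positive penalty only raises the normalized score of a (negative) log-probability sum as the sequence grows. A standalone sketch of that bound, under those assumptions:

    #include <cmath>
    #include <cstddef>

    // Hypothetical helper: best score a group could still attain. `exact`
    // corresponds to StopCriteria::never; the others use the current length.
    float attainable_bound(float best_sum_logprobs, size_t cur_len,
                           size_t max_new_tokens, float length_penalty, bool exact) {
        size_t length = (exact && length_penalty > 0.0f) ? max_new_tokens : cur_len;
        return best_sum_logprobs / std::pow(static_cast<float>(length), length_penalty);
    }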
@@ -173,7 +190,7 @@ struct GroupBeamSearcher {
             continue;
         }
         std::vector<Beam> candidates;
-        candidates.reserve(2 * parameters.group_size);
+        candidates.reserve(parameters.group_size * 2 * parameters.group_size);
         for (const Beam& beam : group->ongoing) {
             std::vector<Token> tokens = log_softmax(logits, beam.global_beam_idx);
             for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) {
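The old reservation of 2 * parameters.group_size covered a single beam's worth of candidates; since the loop above collects extensions from every ongoing beam in the group, the pool can grow to roughly group_size times that, which the new capacity accommodates without mid-loop reallocation. The arithmetic, in a hypothetical standalone form:

    #include <cstddef>

    // Hypothetical illustration: group_size ongoing beams, each proposing up
    // to 2 * group_size continuations.
    size_t candidate_capacity(size_t group_size) {
        return group_size * 2 * group_size;  // e.g. group_size = 5 -> 50 slots
    }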
@@ -251,7 +268,7 @@ std::vector<std::vector<Beam>> finalize(GroupBeamSearcher&& group_beam_searcher)
     for (Group& group : group_beam_searcher.groups) {
         if (!group.done) {
             for (Beam& beam : group.ongoing) {
-                group.finish(std::move(beam), group_beam_searcher.parameters);
+                group.finish(std::move(beam), group_beam_searcher.parameters, true);
             }
         }
         finalized.push_back(std::move(group.min_heap));
