openvinotoolkit · apaniukov · Mar 24, 2025 · Mar 21, 2025 · Mar 24, 2025
diff --git a/src/bytes_to_chars.cpp b/src/bytes_to_chars.cpp
@@ -289,10 +289,7 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     auto chars  = inputs[4].data<const uint8_t>();
 
     const bool has_skips = inputs.size() == 6;
-    bool * skips;
-    if (has_skips) {
-        skips = inputs[5].data<bool>();
-    };
+    auto skips = has_skips ? inputs[5].data<bool>() : nullptr;
 
     // Set output shapes
     outputs[0] = inputs[0];
@@ -340,4 +337,3 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     outputs[4].set_shape({char_pointer});
     return true;
 }
-
diff --git a/src/regex_split.cpp b/src/regex_split.cpp
@@ -25,11 +25,11 @@ const std::map<std::string, RegexSplit::SplitMode> split_modes_map = {
 
 void RegexSplit::compile_pattern_if_necessary(std::string split_pattern) const {
     m_split_mode = split_modes_map.at(m_behaviour);
-    
+
     if (m_search_pattern_pcre2) {
         return;
     }
-    
+
     if (m_behaviour == "contiguous" && split_pattern[split_pattern.length() - 1] != '+') {
         std::stringstream tmp_stream;
         tmp_stream << "(" << split_pattern << ")+";
@@ -149,7 +149,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
         std::lock_guard<std::mutex> lock(m_mutex);
         compile_pattern_if_necessary(split_pattern);
     }
-    
+
     auto get_next_match = [this](const std::string& str, size_t curr_start) -> std::optional<std::pair<size_t, size_t>>{
         auto match = this->m_search_pattern_pcre2->match(str, curr_start);
         if (match.first != SIZE_MAX && match.first != match.second) {
@@ -186,15 +186,15 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
     auto ends   = inputs[3].data<const int32_t>();
     auto chars  = inputs[4].data<const uint8_t>();
     const size_t num_rows = inputs[0].get_size();
-    bool * skips;
-    bool init_skips = false;
+    const bool *skips;
+    Tensor skips_t, new_skips_t;
+
     if (has_skips) {
         skips = inputs[5].data<bool>();
         outputs[5].set_shape(Shape{max_shape});
     } else {
-        skips = new bool[num_rows];
-        init_skips = true;
-        std::fill(skips, skips + num_rows, false);
+        skips_t = Tensor(element::boolean, Shape{num_rows});
+        skips = std::fill_n(skips_t.data<bool>(), num_rows, false) - num_rows;
     };
 
     outputs[0].set_shape(inputs[0].get_shape());
@@ -213,7 +213,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
     if (has_skips) {
         new_skips = outputs[5].data<bool>();
     } else {
-        new_skips = new bool[max_shape];
+        new_skips_t = Tensor(element::boolean, Shape{max_shape});
+        new_skips = new_skips_t.data<bool>();
     };
     int32_t ragged_offset = 0;
 
@@ -234,7 +235,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
             } else {
                 size_t start = 0;
                 uint32_t num_splits = 0;
-               
+
                 size_t last_begin = -1;
                 auto add_split = [&](int begin, int end, bool invert) {
                     switch (m_split_mode) {
@@ -274,14 +275,14 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
                         end = str.length();
                     };
                     new_ends[ragged_offset++] = begins[ragged_col] + end;
-                    
+
                     ++num_splits;
                 };
 
                 std::optional<std::pair<size_t, size_t>> match;
                 while ((match = get_next_match(str, start)) != std::nullopt) {
                     auto [curr_start, curr_end] = *match;
-                    
+
                     if (curr_start != start) {
                         if (has_skips) {
                             new_skips[ragged_offset] = false;
@@ -314,10 +315,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
     if (has_skips) {
         outputs[5].set_shape({size_t(ragged_offset)});
     };
-    if (init_skips) {
-        delete[] skips;
-        delete[] new_skips;
-    };
 
     return true;
 }
diff --git a/src/special_tokens_split.cpp b/src/special_tokens_split.cpp
@@ -74,16 +74,16 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
     const size_t batch_size = inputs[0].get_size();
     const size_t num_chars = inputs[4].get_size();
 
-    bool * skips;
-    bool init_skips = false;
+    Tensor skips_alternative;
+    const bool *skips;
     if (has_skips) {
         skips = inputs[5].data<bool>();
         outputs[5].set_shape(Shape{num_chars});
     } else {
         outputs[5].set_shape(Shape{num_chars});
-        skips = new bool[batch_size];
-        init_skips = true;
-        std::fill(skips, skips + batch_size, false);
+        skips_alternative = Tensor(element::boolean, Shape{batch_size});
+        skips = std::fill_n(skips_alternative.data<bool>(), batch_size, false) -
+                batch_size;
     };
 
     outputs[0].set_shape(inputs[0].get_shape());
@@ -145,8 +145,5 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
     outputs[3].set_shape({size_t(ragged_offset)});
     outputs[5].set_shape({size_t(ragged_offset)});
 
-    if (init_skips) {
-        delete[] skips;
-    };
     return true;
 }
diff --git a/src/utf8_validate.cpp b/src/utf8_validate.cpp
@@ -18,26 +18,26 @@ void UTF8Validate::validate_and_infer_types() {
 bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
     auto begins = inputs[0].data<int32_t>();
     auto ends   = inputs[1].data<int32_t>();
-    uint8_t* bytes  = inputs[2].data<uint8_t>();
+    auto bytes = inputs[2].data<uint8_t>();
     auto begins_shape = inputs[0].get_shape();
     auto chars_shape = inputs[2].get_shape();
-    
+
     const unsigned char replacement_symbol[] = {0xEF, 0xBF, 0xBD};  // UTF-8 encoding for "�"
     outputs[0].set_shape(begins_shape);
     outputs[1].set_shape(begins_shape);
-    
+
     // One byte can be replaced by 3 bytes at most,
     // therefore need to allocate more space
     size_t last_axis = chars_shape.size() - 1;
     chars_shape[last_axis] = chars_shape[last_axis] * 3;
     outputs[2].set_shape(chars_shape);
-    
+
     auto out_begins = outputs[0].data<int32_t>();
     auto out_ends   = outputs[1].data<int32_t>();
     auto out_bytes  = outputs[2].data<uint8_t>();
 
     // UTF-8 code points should not intersect:
-    // if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8, 
+    // if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
     // even if it has a valid bit mask.
     const uint32_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
     uint32_t utf_code_point;
@@ -49,7 +49,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
         // Flag indicating whether UTF8 symbol is complete: true means it's complete, false means we expect continuation.
         // bool new_symbol_flag = true;
         bytes_to_consume = 0;
-        
+
         out_begins[i] = out_idx;
         for (size_t j = begins[i]; j < ends[i]; j += 1) {
-            // Beggining of the symbol.
+            // Beginning of the symbol.
@@ -70,7 +70,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
                 utf_code_point = (0b1111 & bytes[j]) << 6 * bytes_to_consume;
                 continue;
             } else if (!bytes_to_consume && bytes[j] >> 3 == 0b11110) {
-                num_bytes = 4; 
+                num_bytes = 4;
                 bytes_to_consume = 3;
                 utf_code_point = (0b111 & bytes[j]) << 6 * bytes_to_consume;
                 continue;

diff --git a/src/utils.cpp b/src/utils.cpp
@@ -36,8 +36,8 @@ void check_string_input(const Node* node, size_t input_index) {
 void check_string_scalar_input(const Node* node, size_t input_index) {
     auto shape = node->get_input_partial_shape(input_index);
     auto element_type = node->get_input_element_type(input_index);
-    
-    #if false && USE_STRING_TENSORS
+
+#if false && USE_STRING_TENSORS
     // This block is not used when we convert ops to decomposed representation (and we really do)
     OPENVINO_ASSERT(
         (element_type == element::dynamic || element_type == element::string) &&
@@ -117,7 +117,7 @@ void unpack_strings_to_tensors (const std::string* strings, const Shape shape, o
 }
 
 void override_parameter (std::shared_ptr<ov::Node> node, element::Type type, const PartialShape& shape) {
-    if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) { 
+    if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
         // TODO: Apply this change conditionally based on real Parameter value
         if (getenv_bool("OPENVINO_TOKENIZERS_PRINT_DEBUG_INFO", false)) {
             std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n";
@@ -170,10 +170,7 @@ bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorV
     auto ends   = inputs[1].data<const int32_t>();
     auto chars  = inputs[2].data<const uint8_t>();
 
-    bool * skips;
-    if (has_skips) {
-        skips = inputs[3].data<bool>();
-    };
+    auto skips = has_skips ? inputs[3].data<bool>() : nullptr;
 
     // Set output shapes
     outputs[0].set_shape(inputs[0].get_shape());
@@ -276,7 +273,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
     }
     pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(m_compiled, NULL);
     PCRE2_SIZE subject_length = orig_str.size();
-    
+
     // Check if the string matches the pattern
     int num_matches = pcre2_match(
         m_compiled,
@@ -290,7 +287,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
         pcre2_match_data_free(match_data);
         return orig_str;
     }
-    
+
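-    // Allocate dynamically since lenght depends dynamically on the lenght of input and replace strings.
+    // Allocate dynamically since length depends dynamically on the length of input and replace strings.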
     // Allocated memory will be freed at the exit from function.
     size_t buffer_length = sizeof(PCRE2_UCHAR) * 4 * (subject_length + num_matches * replace_pattern.size());
@@ -302,7 +299,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
         pcre2_match_data_free(match_data);
         return orig_str;
     }
-    
+
     int rc = pcre2_substitute(
         m_compiled,
         (PCRE2_SPTR) orig_str.c_str(), orig_str.size(),
@@ -332,7 +329,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
     }
     auto res = std::string(reinterpret_cast<char*>(buffer), buffer_length);
     std::free(buffer);
-    pcre2_match_data_free(match_data); 
+    pcre2_match_data_free(match_data);
     return res;
 }
 
@@ -353,7 +350,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
     );
 
     if (match_result < 0) {
-        pcre2_match_data_free(match_data); 
+        pcre2_match_data_free(match_data);
         return {SIZE_MAX, SIZE_MAX};
     }
 
@@ -363,7 +360,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
     std::pair<size_t, size_t> res = {ovector[0], ovector[1]};
 
     // Free only after copying results from match_data to res;
-    pcre2_match_data_free(match_data); 
+    pcre2_match_data_free(match_data);
     return res;
 }