Skip to content

Commit fb0157c

Browse files
pavel-esir and apaniukov authored Jul 24, 2024
Use PCRE2 fallback for RegexNormalization
* add pcre2
* use string with length instead of ZERO_TERMINATION
* add Licence to third party
* set m_search_pattern_re to nullptr if PCRE2 is used
* remove unnecessary const char*
* fix typo in m_replace_pattern
* Update Del Control Chars Regex
* Update Pass Rate
* fix win build
* revert added_tests in conftest.py
* revert Readme & pass rates
* update README & pass_rates
* do not log errors
* add comment on malloc

---------

Co-authored-by: Artur Paniukov <chgk1101@gmail.com>
1 parent a0d8203 commit fb0157c

14 files changed

+2446
-2221
lines changed
 

‎.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
build-*/

‎README.md

+17-17
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
433433
</tr>
434434
<tr>
435435
<td >SentencePiece</td>
436-
<td >80.53</td>
436+
<td >80.49</td>
437437
<td >4762</td>
438438
</tr>
439439
<tr>
@@ -443,7 +443,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
443443
</tr>
444444
<tr>
445445
<td >WordPiece</td>
446-
<td >91.48</td>
446+
<td >99.10</td>
447447
<td >1327</td>
448448
</tr>
449449
</tbody>
@@ -728,13 +728,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
728728
<tr>
729729
<td >SentencePiece</td>
730730
<td >xlnet-base-cased</td>
731-
<td >65.69</td>
731+
<td >65.27</td>
732732
<td >239</td>
733733
</tr>
734734
<tr>
735735
<td >SentencePiece</td>
736736
<td >xlnet-base-cased_slow</td>
737-
<td >61.43</td>
737+
<td >60.99</td>
738738
<td >223</td>
739739
</tr>
740740
<tr>
@@ -758,79 +758,79 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
758758
<tr>
759759
<td >WordPiece</td>
760760
<td >ProsusAI/finbert</td>
761-
<td >91.59</td>
761+
<td >100.00</td>
762762
<td >107</td>
763763
</tr>
764764
<tr>
765765
<td >WordPiece</td>
766766
<td >bert-base-multilingual-cased</td>
767-
<td >91.59</td>
767+
<td >100.00</td>
768768
<td >107</td>
769769
</tr>
770770
<tr>
771771
<td >WordPiece</td>
772772
<td >bert-base-uncased</td>
773-
<td >91.59</td>
773+
<td >100.00</td>
774774
<td >107</td>
775775
</tr>
776776
<tr>
777777
<td >WordPiece</td>
778778
<td >cointegrated/rubert-tiny2</td>
779-
<td >91.59</td>
779+
<td >100.00</td>
780780
<td >107</td>
781781
</tr>
782782
<tr>
783783
<td >WordPiece</td>
784784
<td >distilbert-base-uncased-finetuned-sst-2-english</td>
785-
<td >91.59</td>
785+
<td >100.00</td>
786786
<td >107</td>
787787
</tr>
788788
<tr>
789789
<td >WordPiece</td>
790790
<td >google/electra-base-discriminator</td>
791-
<td >91.59</td>
791+
<td >100.00</td>
792792
<td >107</td>
793793
</tr>
794794
<tr>
795795
<td >WordPiece</td>
796796
<td >google/mobilebert-uncased</td>
797-
<td >94.51</td>
797+
<td >100.00</td>
798798
<td >91</td>
799799
</tr>
800800
<tr>
801801
<td >WordPiece</td>
802802
<td >jhgan/ko-sbert-sts</td>
803-
<td >91.59</td>
803+
<td >100.00</td>
804804
<td >107</td>
805805
</tr>
806806
<tr>
807807
<td >WordPiece</td>
808808
<td >prajjwal1/bert-mini</td>
809-
<td >94.51</td>
809+
<td >100.00</td>
810810
<td >91</td>
811811
</tr>
812812
<tr>
813813
<td >WordPiece</td>
814814
<td >rajiv003/ernie-finetuned-qqp</td>
815-
<td >94.51</td>
815+
<td >100.00</td>
816816
<td >91</td>
817817
</tr>
818818
<tr>
819819
<td >WordPiece</td>
820820
<td >rasa/LaBSE</td>
821-
<td >80.37</td>
821+
<td >88.79</td>
822822
<td >107</td>
823823
</tr>
824824
<tr>
825825
<td >WordPiece</td>
826826
<td >sentence-transformers/all-MiniLM-L6-v2</td>
827-
<td >91.59</td>
827+
<td >100.00</td>
828828
<td >107</td>
829829
</tr>
830830
<tr>
831831
<td >WordPiece</td>
832832
<td >squeezebert/squeezebert-uncased</td>
833-
<td >94.51</td>
833+
<td >100.00</td>
834834
<td >91</td>
835835
</tr>
836836
</tbody>

‎python/openvino_tokenizers/hf_parser.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -77,9 +77,7 @@ def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[Normalization
7777
steps: List[NormalizationStep] = []
7878

7979
if normalizer_dict["clean_text"] is True:
80-
pass
81-
# TODO: this regex is not supported by re2, skip it until broader syntax support
82-
# steps.append(RegexNormalizationStep.del_control_chars_regex())
80+
steps.append(RegexNormalizationStep.del_control_chars_regex())
8381

8482
# https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L127
8583
if normalizer_dict.get("strip_accents") or normalizer_dict["lowercase"]:

‎python/openvino_tokenizers/tokenizer_pipeline.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -146,10 +146,9 @@ def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
146146

147147
@classmethod
148148
def del_control_chars_regex(cls) -> "RegexNormalizationStep":
149-
# https://github.com/huggingface/tokenizers/blob/8c9cfb0b689bce00b615b9557a9a767f286d7a33/tokenizers/src/normalizers/bert.rs#L17
150149
return cls(
151-
regex_search_pattern=r"((?=[^\n\t\r])\p{Cc})|((?=[^\n\t\r])\p{Cf})",
152-
replace_term=" ",
150+
regex_search_pattern=r"([\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F])", # exclude \n\t\r
151+
replace_term="",
153152
)
154153

155154
@classmethod

‎src/CMakeLists.txt

+13
Original file line number | Diff line number | Diff line change
@@ -278,6 +278,19 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_OPTIONS "${extra_flags}"
278278
target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_OPENVINO_EXTENSION_API)
279279
target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
280280

281+
# Fetch PCRE2
282+
include(FetchContent)
283+
FetchContent_Declare(
284+
PCRE2
285+
URL https://github.com/PCRE2Project/pcre2/archive/refs/tags/pcre2-10.44.zip
286+
URL_HASH SHA256=2d87bd1700bd1993ddea7c56aad2b0373ac2b3d52d9cc78842a6d061ffaf0925
287+
)
288+
FetchContent_MakeAvailable(PCRE2)
289+
290+
target_include_directories(${TARGET_NAME} PRIVATE ${PCRE2_BINARY_DIR})
291+
target_link_libraries(${TARGET_NAME} PRIVATE pcre2-8)
292+
target_compile_definitions(${TARGET_NAME} PRIVATE PCRE2_CODE_UNIT_WIDTH=8)
293+
281294
if(OpenVINO_Frontend_TensorFlow_FOUND)
282295
target_link_libraries(${TARGET_NAME} PRIVATE openvino::frontend::tensorflow)
283296
target_compile_definitions(${TARGET_NAME} PRIVATE OpenVINO_Frontend_TensorFlow_FOUND)

‎src/regex_normalization.cpp

+71-27
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,10 @@
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

5-
6-
75
#include "regex_normalization.hpp"
86
#include "utils.hpp"
97

8+
109
using namespace ov;
1110

1211

@@ -19,32 +18,61 @@ m_global_replace(global_replace) {
1918
auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
2019
auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
2120
auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
22-
auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size());
23-
m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size());
24-
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
21+
auto search_pattern = absl::string_view(search_pattern_buf, search_pattern_const->get_byte_size());
22+
m_replace_pattern = absl::string_view(replace_pattern_buf, replace_pattern_const->get_byte_size());
23+
24+
auto options = re2::RE2::Options();
25+
options.set_log_errors(false);
26+
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern, options);
27+
28+
if (m_search_pattern_re->NumberOfCapturingGroups() == -1) {
29+
// If RE2 was unable to process pattern.
30+
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
31+
m_search_pattern_re = nullptr;
32+
}
33+
2534
constructor_validate_and_infer_types();
2635
}
2736

2837

2938
RegexNormalization::RegexNormalization(
3039
const ov::OutputVector& arguments,
3140
const std::shared_ptr<re2::RE2>& search_pattern_re,
41+
const std::shared_ptr<PCRE2Wrapper>& search_pattern_pcre2,
3242
const absl::string_view replace_pattern,
3343
bool global_replace
3444
) : ov::op::Op(arguments),
3545
m_search_pattern_re(search_pattern_re),
46+
m_search_pattern_pcre2(search_pattern_pcre2),
3647
m_replace_pattern(replace_pattern),
3748
m_global_replace(global_replace) {
3849

39-
if (m_search_pattern_re == nullptr) {
40-
auto search_pattern_const = as_type_ptr<Constant>(arguments[3].get_node_shared_ptr());
41-
auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
42-
auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
43-
auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
44-
auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size());
45-
m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size());
46-
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
50+
auto search_pattern_const = as_type_ptr<Constant>(arguments[3].get_node_shared_ptr());
51+
auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
52+
const char* search_pattern_buf;
53+
const char* replace_pattern_buf;
54+
absl::string_view search_pattern;
55+
56+
if (m_search_pattern_re == nullptr || m_search_pattern_pcre2 == nullptr) {
57+
search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
58+
replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
59+
search_pattern = absl::string_view(search_pattern_buf, search_pattern_const->get_byte_size());
60+
m_replace_pattern = absl::string_view(replace_pattern_buf, replace_pattern_const->get_byte_size());
4761
};
62+
63+
auto options = re2::RE2::Options();
64+
options.set_log_errors(false);
65+
if (m_search_pattern_re == nullptr) {
66+
auto options = re2::RE2::Options();
67+
options.set_log_errors(false);
68+
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern, options);
69+
}
70+
71+
if (m_search_pattern_re->NumberOfCapturingGroups() == -1 && m_search_pattern_pcre2 == nullptr) {
72+
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
73+
m_search_pattern_re = nullptr;
74+
}
75+
4876
constructor_validate_and_infer_types();
4977
}
5078

@@ -58,24 +86,40 @@ void RegexNormalization::validate_and_infer_types() {
5886

5987

6088
bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
61-
if (m_search_pattern_re == nullptr) {
62-
auto search_pattern = absl::string_view(inputs[3].data<const char>(), inputs[3].get_size());
89+
absl::string_view search_pattern;
90+
if (m_search_pattern_re == nullptr || m_search_pattern_pcre2 == nullptr) {
91+
search_pattern = absl::string_view(inputs[3].data<const char>(), inputs[3].get_size());
6392
m_replace_pattern = absl::string_view(inputs[4].data<const char>(), inputs[4].get_size());
64-
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
65-
};
93+
}
94+
95+
if (m_search_pattern_re == nullptr && m_search_pattern_pcre2 == nullptr) {
96+
auto options = re2::RE2::Options();
97+
options.set_log_errors(false);
98+
m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern, options);
99+
}
100+
101+
if ((m_search_pattern_re == nullptr) || (m_search_pattern_re->NumberOfCapturingGroups() == -1 && m_search_pattern_pcre2 == nullptr)) {
102+
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
103+
m_search_pattern_re = nullptr;
104+
}
105+
66106
return evaluate_normalization_helper(
67107
outputs, inputs,
68-
[this](const std::string& str) {
69-
// FIXME: if regex is not valid re2, return string without changing (use another regex engine)
70-
if (m_search_pattern_re->NumberOfCapturingGroups() == -1)
71-
return str;
72-
108+
[this](const std::string& str) -> std::string {
73109
std::string result = str;
74-
if (m_global_replace) {
75-
re2::RE2::GlobalReplace(&result, *m_search_pattern_re, m_replace_pattern);
110+
111+
// Use RE2 where possible, and fallback to PCRE2 if RE2 was not able to process.
112+
if (m_search_pattern_re) {
113+
if (m_global_replace) {
114+
re2::RE2::GlobalReplace(&result, *m_search_pattern_re, m_replace_pattern);
115+
} else {
116+
re2::RE2::Replace(&result, *m_search_pattern_re, m_replace_pattern);
117+
};
118+
return result;
119+
} else if (m_search_pattern_pcre2) {
120+
return m_search_pattern_pcre2->substitute(result, m_replace_pattern, m_global_replace);
76121
} else {
77-
re2::RE2::Replace(&result, *m_search_pattern_re, m_replace_pattern);
78-
};
79-
return result;
122+
return result;
123+
}
80124
});
81125
}

‎src/regex_normalization.hpp

+5-1
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,12 @@
55
#pragma once
66

77
#include "absl/strings/string_view.h"
8+
#include "utils.hpp"
89

910
#include <openvino/op/op.hpp>
1011
#include "openvino/opsets/opset13.hpp"
1112
#include "fast_tokenizer/normalizers/normalizers.h"
13+
#include <pcre2.h>
1214

1315
using namespace ov;
1416
using namespace ov::opset13;
@@ -25,14 +27,15 @@ class RegexNormalization : public ov::op::Op {
2527
RegexNormalization(
2628
const ov::OutputVector& arguments,
2729
const std::shared_ptr<re2::RE2>& search_pattern_re,
30+
const std::shared_ptr<PCRE2Wrapper>& search_pattern_rcre2,
2831
const absl::string_view replace_pattern,
2932
bool global_replace = true
3033
);
3134

3235
void validate_and_infer_types() override;
3336

3437
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
35-
return std::make_shared<RegexNormalization>(inputs, m_search_pattern_re, m_replace_pattern, m_global_replace);
38+
return std::make_shared<RegexNormalization>(inputs, m_search_pattern_re, m_search_pattern_pcre2, m_replace_pattern, m_global_replace);
3639
}
3740

3841
bool visit_attributes(ov::AttributeVisitor& visitor) override {
@@ -47,6 +50,7 @@ class RegexNormalization : public ov::op::Op {
4750
}
4851
private:
4952
mutable std::shared_ptr<re2::RE2> m_search_pattern_re;
53+
mutable std::shared_ptr<PCRE2Wrapper> m_search_pattern_pcre2;
5054
mutable absl::string_view m_replace_pattern;
5155
bool m_global_replace = true;
5256
};

0 commit comments

Comments
 (0)