Skip to content

Commit 3dde884

Browse files
authored
fix memory issues (#257)
1 parent c990f9c commit 3dde884

File tree

4 files changed

+22
-16
lines changed

4 files changed

+22
-16
lines changed

src/bpe_tokenizer.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -253,19 +253,19 @@ BPETokenizerImpl::BPETokenizerImpl(
253253
Vocab new_vocab = vocab;
254254

255255
for (size_t i = 0; i < merges.size(); i++) {
256-
auto pair = merges.at(i);
256+
auto& pair = merges.at(i);
257257
auto id_pair = std::make_pair(vocab.at(pair.first), vocab.at(pair.second));
258258
new_merges[id_pair] = {i, vocab.at(pair.first + pair.second)};
259259
new_vocab.erase(pair.first + pair.second);
260260
}
261261

262-
this->m_vocab = new_vocab;
263-
this->m_merges = new_merges;
262+
m_vocab = std::move(new_vocab);
263+
m_merges = std::move(new_merges);
264264

265265
m_trie = std::make_unique<Trie>();
266-
for(const auto& word: new_vocab) {
266+
for(const auto& word: m_vocab) {
267267
const auto token = std::vector<unsigned char>(word.first.begin(), word.first.end());
268268
m_trie->add(token, word.second);
269269
}
270270
m_cache.reserve(cache_capacity);
271-
}
271+
}

src/regex_split.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class RegexSplit : public ov::op::Op {
6969
mutable std::shared_ptr<PCRE2Wrapper> m_search_pattern_pcre2;
7070
mutable std::shared_ptr<std::set<std::string>> m_skip_tokens;
7171
mutable std::string m_behaviour = "remove";
72-
mutable SplitMode m_split_mode;
72+
mutable SplitMode m_split_mode = SplitMode::REMOVED;
7373
bool m_invert = false;
7474
int m_max_splits = -1;
7575

src/utf8_validate.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
3939
// UTF-8 code points should not intersect:
4040
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
4141
// even if it has a valid bit mask.
42-
const uint64_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
43-
uint64_t utf_code_point;
44-
size_t bytes_to_consume; // Number of additional 0b10xxxxxx bytes to consume to produce a valid UTF8 symbol.
45-
size_t num_bytes;
42+
const uint32_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
43+
uint32_t utf_code_point;
44+
uint32_t bytes_to_consume; // Number of additional 0b10xxxxxx bytes to consume to produce a valid UTF8 symbol.
45+
uint32_t num_bytes;
4646

4747
size_t out_idx = begins[0];
4848
for (size_t i = 0; i < begins_shape[0]; i++) {

src/utils.cpp

+12-6
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,6 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
254254
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(m_compiled, NULL);
255255
PCRE2_SIZE subject_length = orig_str.size();
256256

257-
// Usually found pattern is replaced by shorter string, but set 3 times more space for safety.
258-
// Allocate dynamically since the length depends on the length of the input string.
259-
// Allocated memory will be freed at the exit from function.
260-
auto buffer = (PCRE2_UCHAR*) std::malloc(sizeof(PCRE2_UCHAR) * subject_length * 3);
261-
262257
// Check if the string matches the pattern
263258
int match_result = pcre2_match(
264259
m_compiled,
@@ -272,7 +267,17 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
272267
pcre2_match_data_free(match_data);
273268
return orig_str;
274269
}
275-
270+
271+
// Usually found pattern is replaced by shorter string, but set 3 times more space for safety.
272+
// Allocate dynamically since the length depends on the length of the input string.
273+
// Allocated memory will be freed at the exit from function.
274+
auto buffer = (PCRE2_UCHAR*) std::malloc(sizeof(PCRE2_UCHAR) * subject_length * 3);
275+
if (buffer == nullptr) {
276+
std::cerr << "Memory allocation failed" << std::endl;
277+
pcre2_match_data_free(match_data);
278+
return orig_str;
279+
}
280+
276281
int rc = pcre2_substitute(
277282
m_compiled,
278283
(PCRE2_SPTR) orig_str.c_str(), orig_str.size(),
@@ -292,6 +297,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
292297
std::cerr << "PCRE2 substitution failed with error code " << rc << std::endl;
293298
}
294299
pcre2_match_data_free(match_data);
300+
std::free(buffer);
295301
return orig_str;
296302
}
297303
auto res = std::string(reinterpret_cast<char*>(buffer), subject_length);

0 commit comments

Comments
 (0)