Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use const pointers for const tensors #450

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions src/bytes_to_chars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,10 +289,7 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
auto chars = inputs[4].data<const uint8_t>();

const bool has_skips = inputs.size() == 6;
bool * skips;
if (has_skips) {
skips = inputs[5].data<bool>();
};
auto skips = has_skips ? inputs[5].data<bool>() : nullptr;

// Set output shapes
outputs[0] = inputs[0];
Expand Down Expand Up @@ -340,4 +337,3 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
outputs[4].set_shape({char_pointer});
return true;
}

29 changes: 13 additions & 16 deletions src/regex_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ const std::map<std::string, RegexSplit::SplitMode> split_modes_map = {

void RegexSplit::compile_pattern_if_necessary(std::string split_pattern) const {
m_split_mode = split_modes_map.at(m_behaviour);

if (m_search_pattern_pcre2) {
return;
}

if (m_behaviour == "contiguous" && split_pattern[split_pattern.length() - 1] != '+') {
std::stringstream tmp_stream;
tmp_stream << "(" << split_pattern << ")+";
Expand Down Expand Up @@ -149,7 +149,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
std::lock_guard<std::mutex> lock(m_mutex);
compile_pattern_if_necessary(split_pattern);
}

auto get_next_match = [this](const std::string& str, size_t curr_start) -> std::optional<std::pair<size_t, size_t>>{
auto match = this->m_search_pattern_pcre2->match(str, curr_start);
if (match.first != SIZE_MAX && match.first != match.second) {
Expand Down Expand Up @@ -186,15 +186,15 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
auto ends = inputs[3].data<const int32_t>();
auto chars = inputs[4].data<const uint8_t>();
const size_t num_rows = inputs[0].get_size();
bool * skips;
bool init_skips = false;
const bool *skips;
Tensor skips_t, new_skips_t;

if (has_skips) {
skips = inputs[5].data<bool>();
outputs[5].set_shape(Shape{max_shape});
} else {
skips = new bool[num_rows];
init_skips = true;
std::fill(skips, skips + num_rows, false);
skips_t = Tensor(element::boolean, Shape{num_rows});
skips = std::fill_n(skips_t.data<bool>(), num_rows, false) - num_rows;
};

outputs[0].set_shape(inputs[0].get_shape());
Expand All @@ -213,7 +213,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
if (has_skips) {
new_skips = outputs[5].data<bool>();
} else {
new_skips = new bool[max_shape];
new_skips_t = Tensor(element::boolean, Shape{max_shape});
new_skips = new_skips_t.data<bool>();
};
int32_t ragged_offset = 0;

Expand All @@ -234,7 +235,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
} else {
size_t start = 0;
uint32_t num_splits = 0;

size_t last_begin = -1;
auto add_split = [&](int begin, int end, bool invert) {
switch (m_split_mode) {
Expand Down Expand Up @@ -274,14 +275,14 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
end = str.length();
};
new_ends[ragged_offset++] = begins[ragged_col] + end;

++num_splits;
};

std::optional<std::pair<size_t, size_t>> match;
while ((match = get_next_match(str, start)) != std::nullopt) {
auto [curr_start, curr_end] = *match;

if (curr_start != start) {
if (has_skips) {
new_skips[ragged_offset] = false;
Expand Down Expand Up @@ -314,10 +315,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
if (has_skips) {
outputs[5].set_shape({size_t(ragged_offset)});
};
if (init_skips) {
delete[] skips;
delete[] new_skips;
};

return true;
}
13 changes: 5 additions & 8 deletions src/special_tokens_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,16 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
const size_t batch_size = inputs[0].get_size();
const size_t num_chars = inputs[4].get_size();

bool * skips;
bool init_skips = false;
Tensor skips_alternative;
const bool *skips;
if (has_skips) {
skips = inputs[5].data<bool>();
outputs[5].set_shape(Shape{num_chars});
} else {
outputs[5].set_shape(Shape{num_chars});
skips = new bool[batch_size];
init_skips = true;
std::fill(skips, skips + batch_size, false);
skips_alternative = Tensor(element::boolean, Shape{batch_size});
skips = std::fill_n(skips_alternative.data<bool>(), batch_size, false) -
batch_size;
};

outputs[0].set_shape(inputs[0].get_shape());
Expand Down Expand Up @@ -145,8 +145,5 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
outputs[3].set_shape({size_t(ragged_offset)});
outputs[5].set_shape({size_t(ragged_offset)});

if (init_skips) {
delete[] skips;
};
return true;
}
14 changes: 7 additions & 7 deletions src/utf8_validate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,26 @@ void UTF8Validate::validate_and_infer_types() {
bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
auto begins = inputs[0].data<int32_t>();
auto ends = inputs[1].data<int32_t>();
uint8_t* bytes = inputs[2].data<uint8_t>();
auto bytes = inputs[2].data<uint8_t>();
auto begins_shape = inputs[0].get_shape();
auto chars_shape = inputs[2].get_shape();

const unsigned char replacement_symbol[] = {0xEF, 0xBF, 0xBD}; // UTF-8 encoding for "�"
outputs[0].set_shape(begins_shape);
outputs[1].set_shape(begins_shape);

// One byte can be replaced by 3 bytes at most,
// therefore need to allocate more space
size_t last_axis = chars_shape.size() - 1;
chars_shape[last_axis] = chars_shape[last_axis] * 3;
outputs[2].set_shape(chars_shape);

auto out_begins = outputs[0].data<int32_t>();
auto out_ends = outputs[1].data<int32_t>();
auto out_bytes = outputs[2].data<uint8_t>();

// UTF-8 code points should not intersect:
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
// even if it has a valid bit mask.
const uint32_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
uint32_t utf_code_point;
Expand All @@ -49,7 +49,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
// Flag indicating whether UTF8 symbol is complete: true means it's complete, false means we expect continuation.
// bool new_symbol_flag = true;
bytes_to_consume = 0;

out_begins[i] = out_idx;
for (size_t j = begins[i]; j < ends[i]; j += 1) {
// Beginning of the symbol.
Expand All @@ -70,7 +70,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
utf_code_point = (0b1111 & bytes[j]) << 6 * bytes_to_consume;
continue;
} else if (!bytes_to_consume && bytes[j] >> 3 == 0b11110) {
num_bytes = 4;
num_bytes = 4;
bytes_to_consume = 3;
utf_code_point = (0b111 & bytes[j]) << 6 * bytes_to_consume;
continue;
Expand Down
23 changes: 10 additions & 13 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ void check_string_input(const Node* node, size_t input_index) {
void check_string_scalar_input(const Node* node, size_t input_index) {
auto shape = node->get_input_partial_shape(input_index);
auto element_type = node->get_input_element_type(input_index);
#if false && USE_STRING_TENSORS

#if false && USE_STRING_TENSORS
// This block is not used when we convert ops to decomposed representation (and we really do)
OPENVINO_ASSERT(
(element_type == element::dynamic || element_type == element::string) &&
Expand Down Expand Up @@ -117,7 +117,7 @@ void unpack_strings_to_tensors (const std::string* strings, const Shape shape, o
}

void override_parameter (std::shared_ptr<ov::Node> node, element::Type type, const PartialShape& shape) {
if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
// TODO: Apply this change conditionally based on real Parameter value
if (getenv_bool("OPENVINO_TOKENIZERS_PRINT_DEBUG_INFO", false)) {
std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n";
Expand Down Expand Up @@ -170,10 +170,7 @@ bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorV
auto ends = inputs[1].data<const int32_t>();
auto chars = inputs[2].data<const uint8_t>();

bool * skips;
if (has_skips) {
skips = inputs[3].data<bool>();
};
auto skips = has_skips ? inputs[3].data<bool>() : nullptr;

// Set output shapes
outputs[0].set_shape(inputs[0].get_shape());
Expand Down Expand Up @@ -276,7 +273,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
}
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(m_compiled, NULL);
PCRE2_SIZE subject_length = orig_str.size();

// Check if the string matches the pattern
int num_matches = pcre2_match(
m_compiled,
Expand All @@ -290,7 +287,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
pcre2_match_data_free(match_data);
return orig_str;
}

// Allocate dynamically since the length depends on the lengths of the input and replace strings.
// Allocated memory will be freed at the exit from function.
size_t buffer_length = sizeof(PCRE2_UCHAR) * 4 * (subject_length + num_matches * replace_pattern.size());
Expand All @@ -302,7 +299,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
pcre2_match_data_free(match_data);
return orig_str;
}

int rc = pcre2_substitute(
m_compiled,
(PCRE2_SPTR) orig_str.c_str(), orig_str.size(),
Expand Down Expand Up @@ -332,7 +329,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
}
auto res = std::string(reinterpret_cast<char*>(buffer), buffer_length);
std::free(buffer);
pcre2_match_data_free(match_data);
pcre2_match_data_free(match_data);
return res;
}

Expand All @@ -353,7 +350,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
);

if (match_result < 0) {
pcre2_match_data_free(match_data);
pcre2_match_data_free(match_data);
return {SIZE_MAX, SIZE_MAX};
}

Expand All @@ -363,7 +360,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
std::pair<size_t, size_t> res = {ovector[0], ovector[1]};

// Free only after copying results from match_data to res;
pcre2_match_data_free(match_data);
pcre2_match_data_free(match_data);
return res;
}

Expand Down
Loading