Skip to content

Commit

Permalink
a general regex match algorithm to fix all related issues
Browse files Browse the repository at this point in the history
  • Loading branch information
wenbingl committed Jan 11, 2025
1 parent 641930d commit 2a4fe1a
Show file tree
Hide file tree
Showing 5 changed files with 1,108 additions and 722 deletions.
13 changes: 8 additions & 5 deletions operators/tokenizer/bpe_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
// Parse input
auto special_token_split_res = bbpe_tokenizer_->SplitByAddedAndSpecial(input);
bpe::PreTokenizerWithRegEx reg_splitter;
// NOTE: the pattern was already validated when loading the JSON file,
// so it is safe to ignore the return value here.
auto status = reg_splitter.Compile(bbpe_tokenizer_->GetPreTokenizerRegex(ModelName()));
assert(status.IsOk());

for (auto& seg_id : special_token_split_res) {
if (static_cast<int64_t>(res.size()) >= max_length) break;
Expand All @@ -287,13 +291,12 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
}

while (static_cast<int64_t>(res.size()) < max_length) {
std::string regex_expr = bbpe_tokenizer_->GetPreTokenizerRegex(ModelName());
auto [b, tok] = reg_splitter.GetNextToken(regex_expr);

if (!b) break;
std::u32string_view tok = reg_splitter.GetNextToken();
if (tok.empty()) {
break;
}

std::string utf8_token = std::string(ustring(tok));

size_t space_dif = 0;
if (compute_offset_mapping) {
// Handle special case for offset mapping
Expand Down
6 changes: 6 additions & 0 deletions operators/tokenizer/bpe_tokenizer_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ class BpeModel {
ORTX_JSON_RETURN_IF_NULL(&node, "pattern", iter_pattern);
ORTX_JSON_RETURN_IF_NULL(iter_pattern, "Regex", regex_str);
pre_tokenizer_regex_ = regex_str->get<std::string>();
// Validate the regex pattern
bpe::PreTokenizerWithRegEx pre_tokenizer;
auto status = pre_tokenizer.Compile(pre_tokenizer_regex_);
if (!status.IsOk()) {
return status;
}
} else {
if (pre_tokenizer_types_.count(pre_type) == 0) {
return {kOrtxErrorNotImplemented, "Unsupported pretokenizer type!"};
Expand Down
Loading

0 comments on commit 2a4fe1a

Please sign in to comment.