Skip to content

Commit

Permalink
Merge branch 'main' into sayanshaw/python-decoder-api
Browse files Browse the repository at this point in the history
  • Loading branch information
sayanshaw24 authored Jan 13, 2025
2 parents 24e993b + c8bb35d commit cb1a92c
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 803 deletions.
13 changes: 8 additions & 5 deletions operators/tokenizer/bpe_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
// Parse input
auto special_token_split_res = bbpe_tokenizer_->SplitByAddedAndSpecial(input);
bpe::PreTokenizerWithRegEx reg_splitter;
// NOTE: the pattern was already validated when the JSON file was loaded,
// so it is safe to ignore the return value here.
auto status = reg_splitter.Compile(bbpe_tokenizer_->GetPreTokenizerRegex(ModelName()));
assert(status.IsOk());

for (auto& seg_id : special_token_split_res) {
if (static_cast<int64_t>(res.size()) >= max_length) break;
Expand All @@ -287,13 +291,12 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
}

while (static_cast<int64_t>(res.size()) < max_length) {
std::string regex_expr = bbpe_tokenizer_->GetPreTokenizerRegex(ModelName());
auto [b, tok] = reg_splitter.GetNextToken(regex_expr);

if (!b) break;
std::u32string_view tok = reg_splitter.GetNextToken();
if (tok.empty()) {
break;
}

std::string utf8_token = std::string(ustring(tok));

size_t space_dif = 0;
if (compute_offset_mapping) {
// Handle special case for offset mapping
Expand Down
8 changes: 7 additions & 1 deletion operators/tokenizer/bpe_tokenizer_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ class BpeModel {
ORTX_JSON_RETURN_IF_NULL(&node, "pattern", iter_pattern);
ORTX_JSON_RETURN_IF_NULL(iter_pattern, "Regex", regex_str);
pre_tokenizer_regex_ = regex_str->get<std::string>();
// Validate the regex pattern
bpe::PreTokenizerWithRegEx pre_tokenizer;
auto status = pre_tokenizer.Compile(pre_tokenizer_regex_);
if (!status.IsOk()) {
return status;
}
} else {
if (pre_tokenizer_types_.count(pre_type) == 0) {
return {kOrtxErrorNotImplemented, "Unsupported pretokenizer type!"};
Expand Down Expand Up @@ -146,7 +152,7 @@ class BpeModel {
} else {
vocab_map_[line] = id;
}
special_tokens_.Add(std::move(line_32), id);
ORTX_RETURN_IF_ERROR(special_tokens_.Add(std::move(line_32), id));
}
}

Expand Down
Loading

0 comments on commit cb1a92c

Please sign in to comment.