From f8f3ae93cc2636550d28cc466e1cb8f95c9a6fb7 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:34:16 -0800 Subject: [PATCH] Support fast tokenizer type in JSON tokenizer (#876) * Support fast tokenizer type too * more log * Update tokenizer dictionary entry name --- .pipelines/ci.yml | 2 +- operators/tokenizer/tokenizer_jsconfig.hpp | 12 +++++++++--- operators/tokenizer/tokenizer_op_impl.hpp | 2 +- shared/api/tokenizer_impl.cc | 2 +- test/test_pp_api.py | 9 +++++++++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml index c78433cf..8c375766 100644 --- a/.pipelines/ci.yml +++ b/.pipelines/ci.yml @@ -740,7 +740,7 @@ stages: steps: - script: | cd $(Build.BinariesDirectory) - git clone https://github.com/emscripten-core/emsdk + git clone https://github.com/emscripten-core/emsdk --depth 1 --branch 3.1.74 emsdk/emsdk install latest emsdk/emsdk activate latest displayName: Setup emscripten pipeline diff --git a/operators/tokenizer/tokenizer_jsconfig.hpp b/operators/tokenizer/tokenizer_jsconfig.hpp index 75fac5f4..babb929b 100644 --- a/operators/tokenizer/tokenizer_jsconfig.hpp +++ b/operators/tokenizer/tokenizer_jsconfig.hpp @@ -15,7 +15,7 @@ enum class TokenType { }; constexpr std::pair kTokenizerDict[] = { - {"PreTrainedTokenizerFast", TokenType::kBPE}, + {"PreTrainedTokenizer", TokenType::kBPE}, {"CLIPTokenizer", TokenType::kBPE}, {"WhisperTokenizer", TokenType::kBPE}, {"GemmaTokenizer", TokenType::kBPE}, @@ -256,10 +256,16 @@ class TokenJsonConfig final { } static TokenType GetTokenType(const std::string& tok) { - static const std::unordered_map dict { + static const std::unordered_map dict { std::begin(kTokenizerDict), std::end(kTokenizerDict) }; - auto iter = dict.find(tok); + std::string_view tok_class(tok); + auto pos = tok_class.find("Fast"); + if (pos != std::string_view::npos && pos + 4 == tok_class.size()) { + tok_class.remove_suffix(4); + } + + 
auto iter = dict.find(tok_class); return iter == dict.end() ? TokenType::kUnknown : iter->second; } diff --git a/operators/tokenizer/tokenizer_op_impl.hpp b/operators/tokenizer/tokenizer_op_impl.hpp index 455f88fa..d9f2180e 100644 --- a/operators/tokenizer/tokenizer_op_impl.hpp +++ b/operators/tokenizer/tokenizer_op_impl.hpp @@ -33,7 +33,7 @@ class JsonTokenizerOpKernel { } else if (type == TokenType::kBPE) { tokenizer_ = std::make_unique(); } else { - return OrtxStatus(kOrtxErrorCorruptData, "Unknown tokenizer type"); + return OrtxStatus(kOrtxErrorCorruptData, "Unknown tokenizer type: " + cfg.tokenizer_class_); } return std::visit([&](auto& ptr) { return ptr->Load(cfg); }, tokenizer_); diff --git a/shared/api/tokenizer_impl.cc b/shared/api/tokenizer_impl.cc index fe5ad527..fe7d6440 100644 --- a/shared/api/tokenizer_impl.cc +++ b/shared/api/tokenizer_impl.cc @@ -67,7 +67,7 @@ OrtxStatus TokenizerImpl::LoadTokenizer(const OrtxTokenizerBlob* blob) { return status; } - return OrtxStatus(kOrtxErrorNotImplemented, "Unsupported tokenizer class"); + return OrtxStatus(kOrtxErrorNotImplemented, "Unsupported tokenizer class: " + tok_config_->tokenizer_class_); } OrtxStatus TokenizerImpl::Load(const OrtxTokenizerBlob& blob) { diff --git a/test/test_pp_api.py b/test/test_pp_api.py index de2ca98e..7abca802 100644 --- a/test/test_pp_api.py +++ b/test/test_pp_api.py @@ -140,6 +140,15 @@ def test_Qwen_QVQ_tokenizer(self): ortx_inputs = tokenizer.tokenize(test_sentence) np.testing.assert_array_equal(ortx_inputs, inputs) + def test_Phi4_tokenizer(self): + model_id = "/g/phi-x-12202024" + test_sentence = [self.tokenizer_test_sentence] + hf_enc = AutoTokenizer.from_pretrained(model_id) + inputs = hf_enc(test_sentence)["input_ids"] + tokenizer = pp_api.Tokenizer(model_id) + ortx_inputs = tokenizer.tokenize(test_sentence) + np.testing.assert_array_equal(ortx_inputs, inputs) + if __name__ == "__main__": unittest.main()