
Commit

Merge branch 'main' into skottmckay/AddAbilityToBuildDummyNugetPackageWithNoOps
skottmckay authored Oct 19, 2023
2 parents 80cb4a0 + 4d2930e commit 8544e46
Showing 5 changed files with 19 additions and 3 deletions.
4 changes: 4 additions & 0 deletions operators/tokenizer/bpe_kernels.cc
@@ -148,6 +148,9 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
     if (IsUnicodeSpace(str.back())) {
       str.pop_back();
     }
+    // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
+    str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
+    str.erase(std::remove(str.begin(), str.end(), '\r'), str.end());
     input = str;
   }
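
The added erase–remove calls strip every '\n' and '\r' from the input before BPE runs, matching CLIP's behavior of treating newlines as whitespace that gets cleaned away. A minimal Python sketch of the same transformation (the helper name is illustrative, not part of this repo):

```python
def strip_newlines(text: str) -> str:
    # Same effect as the two C++ erase/remove calls: delete newline
    # characters outright so the BPE stage never sees them.
    return text.replace("\n", "").replace("\r", "")

assert strip_newlines("first line\r\nsecond\nthird") == "first linesecondthird"
```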

@@ -196,6 +199,7 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,

   while (static_cast<int64_t>(res.size()) < max_length) {
     auto [b, tok] = regcmp.GetNextToken();
+
     if (!b) break;

     std::string utf8_token = std::string(ustring(tok));
1 change: 1 addition & 0 deletions operators/tokenizer/bpe_utils.hpp
@@ -101,6 +101,7 @@ class TokenWithRegularExp {

  private:
   std::u32string_view TryMatch() {
+
     // python pattern:
     // 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
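
The commented pattern is the GPT-2-style pre-tokenization regex that TryMatch reimplements in C++: it peels off apostrophe suffixes ('s, 're, 'm, ...), letter runs, digit runs, punctuation runs, and whitespace. A quick illustration using the third-party regex package (a sketch; the C++ code matches one token at a time rather than using findall):

```python
import regex  # the stdlib re module does not support \p{L}/\p{N}

pat = regex.compile(
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
print(pat.findall("you're, i'm, don't"))
# ['you', "'re", ',', ' i', "'m", ',', ' don', "'t"]
```

This is why the new test strings lean on apostrophes: each contraction must split into a word token plus a suffix token exactly as the Python pattern dictates.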
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,6 +1,7 @@
 # include requirements.txt so pip has context to avoid installing incompatible dependencies
 -r requirements.txt
 pytest
+ftfy
 # multiple versions of onnxruntime are supported, but only one can be installed at a time
 protobuf < 4.0.0
 onnxruntime >=1.12.0
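
ftfy joins the dev requirements because the tests below exercise CLIP-style preprocessing, and CLIP's original tokenizer runs ftfy.fix_text over input first. In brief, ftfy repairs mojibake and similar Unicode damage:

```python
import ftfy

# fix_text repairs text that was encoded as UTF-8 but decoded as Latin-1/cp1252
print(ftfy.fix_text("sÃ©ance"))  # -> "séance"
```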
14 changes: 11 additions & 3 deletions test/test_autotokenizer.py
@@ -3,6 +3,7 @@
 import unittest

 import numpy as np
+import ftfy
 from transformers import AutoTokenizer, GPT2Tokenizer
 from onnxruntime_extensions import OrtPyFunction, gen_processing_models, ort_inference, util

@@ -49,7 +50,10 @@ def test_t5_tokenizer(self):

     def test_roberta_base(self):
         tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=False)
-        text = "Agree. Keep trying, then if your rejected every time. I'm sorry your done."
+        text = """
+            Agree. Keep trying, then if your rejected every time. I'm sorry your done.
+            Testing words with apostrophes such as you're, i'm, don't, etc.
+        """
         ids = tokenizer.encode(text, return_tensors="np")
         m_tok, m_detok = gen_processing_models(tokenizer, pre_kwargs={}, post_kwargs={})

@@ -60,7 +64,11 @@ def test_roberta_base(self):

     def test_clip_tokenizer(self):
         tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32", use_fast=False)
-        text = "Wow, these models are getting popular."
+        text = """
+            1. Testing long text with multiple lines to check newline handling
+            2. As well as words with apostrophes such as you're, i'm, don't, etc.
+            3. And weird characters such as . , ~ ? ( ) " [ ] ! : - .
+        """
         ids = tokenizer.encode(text, return_tensors="np")

         ort_tok = OrtPyFunction.from_model(gen_processing_models(
@@ -71,7 +79,7 @@ def test_clip_tokenizer(self):

     def test_gpt2_tokenizer(self):
         tokenizer = GPT2Tokenizer.from_pretrained("Xenova/gpt-4", use_fast=False)
-        text = "Deep learning has come a long way, no?"
+        text = "Testing words with apostrophes such as you're, i'm, don't, etc."
         ids = tokenizer.encode(text, return_tensors="np")

         ort_tok = OrtPyFunction.from_model(gen_processing_models(
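
Each of these tests follows the same parity pattern: tokenize with the Hugging Face tokenizer, convert that tokenizer to an ONNX model with gen_processing_models, run it via ONNX Runtime, and check the ids agree. A condensed sketch, under the assumption that ort_inference runs the pre-processing model directly (the repo's actual assertions may unpack outputs differently):

```python
import numpy as np
from transformers import AutoTokenizer
from onnxruntime_extensions import gen_processing_models, ort_inference

tokenizer = AutoTokenizer.from_pretrained(
    "SamLowe/roberta-base-go_emotions", use_fast=False)
text = "Testing words with apostrophes such as you're, i'm, don't, etc."

hf_ids = tokenizer.encode(text, return_tensors="np")

# convert the HF tokenizer into ONNX pre- and post-processing models
m_tok, m_detok = gen_processing_models(tokenizer, pre_kwargs={}, post_kwargs={})

# run the ONNX tokenizer and require token-id parity with Hugging Face
ort_ids = ort_inference(m_tok, [text])[0]
np.testing.assert_array_equal(hf_ids[0], ort_ids)
```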
2 changes: 2 additions & 0 deletions test/test_cliptok.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 import onnxruntime as _ort
+import ftfy

 from pathlib import Path
 from onnx import helper, onnx_pb as onnx_proto
@@ -105,6 +106,7 @@ def test_tokenizer(self):
         self._run_tokenizer(["Testing multiple sequences of spaces"])
         self._run_tokenizer([" in the beginning and the end. "])
         self._run_tokenizer([" "])
+        self._run_tokenizer(["Testing words with apostrophes such as you're, i'm, don't, etc."])

     def test_converter(self):
         fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer",
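
The apostrophe case is worth pinning down in test_cliptok.py because CLIP's original tokenizer cleans text before BPE: ftfy.fix_text plus HTML unescaping, then collapsing all whitespace runs (newlines included) to single spaces and lowercasing. A rough sketch of that cleaning stage, mirroring openai/CLIP's basic_clean/whitespace_clean helpers (the function name here is illustrative):

```python
import html
import re

import ftfy

def clip_clean(text: str) -> str:
    # basic_clean: repair encoding damage, then unescape HTML entities
    text = html.unescape(ftfy.fix_text(text))
    # whitespace_clean: collapse whitespace runs (incl. newlines) to one space
    return re.sub(r"\s+", " ", text).strip().lower()

print(clip_clean("Testing words with apostrophes\nsuch as you're, i'm, don't."))
# testing words with apostrophes such as you're, i'm, don't.
```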
