
Commit

Merge branch 'main' into skottmckay/AddAbilityToBuildDummyNugetPackageWithNoOps
skottmckay authored Oct 19, 2023
2 parents 80cb4a0 + 4d2930e commit 8544e46
Showing 5 changed files with 19 additions and 3 deletions.
4 changes: 4 additions & 0 deletions operators/tokenizer/bpe_kernels.cc
@@ -148,6 +148,9 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
     if (IsUnicodeSpace(str.back())) {
       str.pop_back();
     }
+    // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
+    str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
+    str.erase(std::remove(str.begin(), str.end(), '\r'), str.end());
     input = str;
   }
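
The added erase–remove calls strip every '\n' and '\r' from the input before BPE runs, matching CLIP's behavior of treating newlines as whitespace that gets cleaned away. A minimal Python sketch of the same transformation (the helper name is illustrative, not part of this repo):

```python
def strip_newlines(text: str) -> str:
    # Same effect as the two C++ erase/remove calls: delete newline
    # characters outright so the BPE stage never sees them.
    return text.replace("\n", "").replace("\r", "")

assert strip_newlines("first line\r\nsecond\nthird") == "first linesecondthird"
```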

@@ -196,6 +199,7 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,

   while (static_cast<int64_t>(res.size()) < max_length) {
     auto [b, tok] = regcmp.GetNextToken();
+
     if (!b) break;

     std::string utf8_token = std::string(ustring(tok));
1 change: 1 addition & 0 deletions operators/tokenizer/bpe_utils.hpp
@@ -101,6 +101,7 @@ class TokenWithRegularExp {

  private:
   std::u32string_view TryMatch() {
+
     // python pattern:
     // 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
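
The commented pattern is the GPT-2-style pre-tokenization regex that TryMatch reimplements in C++: it peels off apostrophe suffixes ('s, 're, 'm, ...), letter runs, digit runs, punctuation runs, and whitespace. A quick illustration using the third-party regex package (a sketch; the C++ code matches one token at a time rather than using findall):

```python
import regex  # the stdlib re module does not support \p{L}/\p{N}

pat = regex.compile(
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
print(pat.findall("you're, i'm, don't"))
# ['you', "'re", ',', ' i', "'m", ',', ' don', "'t"]
```

This is why the new test strings lean on apostrophes: each contraction must split into a word token plus a suffix token exactly as the Python pattern dictates.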
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,6 +1,7 @@
 # include requirements.txt so pip has context to avoid installing incompatible dependencies
 -r requirements.txt
 pytest
+ftfy
 # multiple versions of onnxruntime are supported, but only one can be installed at a time
 protobuf < 4.0.0
 onnxruntime >=1.12.0
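
ftfy joins the dev requirements because the tests below exercise CLIP-style preprocessing, and CLIP's original tokenizer runs ftfy.fix_text over input first. In brief, ftfy repairs mojibake and similar Unicode damage:

```python
import ftfy

# fix_text repairs text that was encoded as UTF-8 but decoded as Latin-1/cp1252
print(ftfy.fix_text("sÃ©ance"))  # -> "séance"
```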
14 changes: 11 additions & 3 deletions test/test_autotokenizer.py
@@ -3,6 +3,7 @@
 import unittest

 import numpy as np
+import ftfy
 from transformers import AutoTokenizer, GPT2Tokenizer
 from onnxruntime_extensions import OrtPyFunction, gen_processing_models, ort_inference, util

@@ -49,7 +50,10 @@ def test_t5_tokenizer(self):

     def test_roberta_base(self):
         tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=False)
-        text = "Agree. Keep trying, then if your rejected every time. I'm sorry your done."
+        text = """
+            Agree. Keep trying, then if your rejected every time. I'm sorry your done.
+            Testing words with apostrophes such as you're, i'm, don't, etc.
+        """
         ids = tokenizer.encode(text, return_tensors="np")
         m_tok, m_detok = gen_processing_models(tokenizer, pre_kwargs={}, post_kwargs={})

@@ -60,7 +64,11 @@ def test_roberta_base(self):

     def test_clip_tokenizer(self):
         tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32", use_fast=False)
-        text = "Wow, these models are getting popular."
+        text = """
+            1. Testing long text with multiple lines to check newline handling
+            2. As well as words with apostrophes such as you're, i'm, don't, etc.
+            3. And weird characters such as . , ~ ? ( ) " [ ] ! : - .
+        """
         ids = tokenizer.encode(text, return_tensors="np")

         ort_tok = OrtPyFunction.from_model(gen_processing_models(
@@ -71,7 +79,7 @@ def test_clip_tokenizer(self):

     def test_gpt2_tokenizer(self):
         tokenizer = GPT2Tokenizer.from_pretrained("Xenova/gpt-4", use_fast=False)
-        text = "Deep learning has come a long way, no?"
+        text = "Testing words with apostrophes such as you're, i'm, don't, etc."
         ids = tokenizer.encode(text, return_tensors="np")

         ort_tok = OrtPyFunction.from_model(gen_processing_models(
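
Each of these tests follows the same parity pattern: tokenize with the Hugging Face tokenizer, convert that tokenizer to an ONNX model with gen_processing_models, run it via ONNX Runtime, and check the ids agree. A condensed sketch, under the assumption that ort_inference runs the pre-processing model directly (the repo's actual assertions may unpack outputs differently):

```python
import numpy as np
from transformers import AutoTokenizer
from onnxruntime_extensions import gen_processing_models, ort_inference

tokenizer = AutoTokenizer.from_pretrained(
    "SamLowe/roberta-base-go_emotions", use_fast=False)
text = "Testing words with apostrophes such as you're, i'm, don't, etc."

hf_ids = tokenizer.encode(text, return_tensors="np")

# convert the HF tokenizer into ONNX pre- and post-processing models
m_tok, m_detok = gen_processing_models(tokenizer, pre_kwargs={}, post_kwargs={})

# run the ONNX tokenizer and require token-id parity with Hugging Face
ort_ids = ort_inference(m_tok, [text])[0]
np.testing.assert_array_equal(hf_ids[0], ort_ids)
```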
2 changes: 2 additions & 0 deletions test/test_cliptok.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 import onnxruntime as _ort
+import ftfy

 from pathlib import Path
 from onnx import helper, onnx_pb as onnx_proto
@@ -105,6 +106,7 @@ def test_tokenizer(self):
         self._run_tokenizer(["Testing multiple sequences of spaces"])
         self._run_tokenizer([" in the beginning and the end. "])
         self._run_tokenizer([" "])
+        self._run_tokenizer(["Testing words with apostrophes such as you're, i'm, don't, etc."])

     def test_converter(self):
         fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer",
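
The apostrophe case is worth pinning down in test_cliptok.py because CLIP's original tokenizer cleans text before BPE: ftfy.fix_text plus HTML unescaping, then collapsing all whitespace runs (newlines included) to single spaces and lowercasing. A rough sketch of that cleaning stage, mirroring openai/CLIP's basic_clean/whitespace_clean helpers (the function name here is illustrative):

```python
import html
import re

import ftfy

def clip_clean(text: str) -> str:
    # basic_clean: repair encoding damage, then unescape HTML entities
    text = html.unescape(ftfy.fix_text(text))
    # whitespace_clean: collapse whitespace runs (incl. newlines) to one space
    return re.sub(r"\s+", " ", text).strip().lower()

print(clip_clean("Testing words with apostrophes\nsuch as you're, i'm, don't."))
# testing words with apostrophes such as you're, i'm, don't.
```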
