From 49a893549296067c7261e14369fc28b7ebbc7aad Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Wed, 1 Nov 2023 12:03:26 +0200 Subject: [PATCH 1/7] add docstring to tokenizer singleton --- src/canopy/tokenizer/tokenizer.py | 108 +++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 7e9d53f6..508c23c9 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -1,10 +1,29 @@ -from typing import List, Optional +from typing import List, Optional, Type from .openai import OpenAITokenizer from .base import BaseTokenizer class Tokenizer: + + """ + Singleton class for tokenization. + The singleton behavior unify tokenization across the system. + + Usage: + + To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, *args, **kwargs) + >>> from canopy.tokenizer import Tokenizer + >>> Tokenizer.initialize() + + Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it: + >>> tokenizer = Tokenizer() + >>> tokenizer.tokenize("Hello world!") + ['Hello', 'world', '!'] + >>> tokenizer.detokenize(['Hello', 'world', '!']) + 'Hello world!' + """ # noqa: E501 + _instance = None _tokenizer_instance: Optional[BaseTokenizer] = None _initialized = False @@ -20,7 +39,43 @@ def __new__(cls): return cls._instance @classmethod - def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs): + def initialize(cls, + tokenizer_class: Type[BaseTokenizer] = DEFAULT_TOKENIZER_CLASS, + **kwargs): + """ + Initialize the tokenizer singleton. + + Args: + tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer. + **kwargs: Keyword arguments to pass to the tokenizer class constructor. 
+ + Returns: + None + + Examples: + Initialize the tokenizer with the default tokenizer class: + + >>> from canopy.tokenizer import Tokenizer + >>> Tokenizer.initialize() + + Initialize the tokenizer with a custom tokenizer class: + + >>> from canopy.tokenizer import Tokenizer + >>> from canopy.tokenizer.base import BaseTokenizer + >>> class MyTokenizer(BaseTokenizer): + ... def tokenize(self, text: str) -> List[str]: + ... return text.split() + ... def detokenize(self, tokens: List[str]) -> str: + ... return " ".join(tokens) + ... def messages_token_count(self, messages) -> int: + ... return sum([self.token_count(message) + 3 for message in messages]) + >>> Tokenizer.initialize(MyTokenizer) + + Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code: + + >>> from canopy.tokenizer import Tokenizer + >>> tokenizer = Tokenizer() + """ # noqa: E501 if not issubclass(tokenizer_class, BaseTokenizer): raise ValueError("Invalid tokenizer class provided") if issubclass(tokenizer_class, Tokenizer): @@ -30,12 +85,22 @@ def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs): @classmethod def clear(cls): + """ + Clear the tokenizer singleton. + """ cls._instance = None cls._tokenizer_instance = None cls._initialized = False @classmethod def initialize_from_config(cls, config: dict): + """ + Initialize the tokenizer singleton from a config dictionary. + Used by the config module to initialize the tokenizer from a config file. + + Args: + config: A dictionary containing the tokenizer configuration. Must contain a "type" key with the tokenizer class name. + """ # noqa: E501 if cls._initialized: raise ValueError("Tokenizer has already been initialized") config["type"] = config.get("type", cls.DEFAULT_TOKENIZER_CLASS.__name__) @@ -43,13 +108,52 @@ def initialize_from_config(cls, config: dict): cls._initialized = True def tokenize(self, text: str) -> List[str]: + """ + Splits a text into tokens. 
+ + Args: + text: The text to tokenize as a string. + + Returns: + A list of tokens. + """ return self._tokenizer_instance.tokenize(text) # type: ignore[union-attr] def detokenize(self, tokens: List[str]) -> str: + """ + Joins a list of tokens into a text. + + Args: + tokens: The tokens to join as a list of strings. Consider using tokenize() first. + + Returns: + The joined text as a string. + """ return self._tokenizer_instance.detokenize(tokens) # type: ignore[union-attr] def token_count(self, text: str) -> int: + """ + Counts the number of tokens in a text. + + Args: + text: The text to count as a string. + + Returns: + The number of tokens in the text. + """ return self._tokenizer_instance.token_count(text) # type: ignore[union-attr] def messages_token_count(self, messages) -> int: + """ + Counts the number of tokens in a Messages object. + Behind the scenes, for each LLM provider there might be a different overhead for each message in the prompt, + which is not necessarily the same as the number of tokens in the message text. + This method takes care of that overhead and returns the total number of tokens in the prompt, as counted by the LLM provider. + + Args: + messages: The Messages object to count. + + Returns: + The number of tokens in the Messages object. 
+ """ # noqa: E501 return self._tokenizer_instance.messages_token_count(messages) # type: ignore[union-attr] # noqa: E501 From 791abd81ce1c07e713a03423f3c95d37495ae9df Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Wed, 1 Nov 2023 12:13:18 +0200 Subject: [PATCH 2/7] add docstrings to openai tokenizer --- src/canopy/tokenizer/openai.py | 59 +++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/src/canopy/tokenizer/openai.py b/src/canopy/tokenizer/openai.py index 069925d9..180cd4fd 100644 --- a/src/canopy/tokenizer/openai.py +++ b/src/canopy/tokenizer/openai.py @@ -5,30 +5,87 @@ class OpenAITokenizer(BaseTokenizer): + """ + Tokenizer for OpenAI models, based on the tiktoken library. + + Usage: + Initialize the singleton tokenizer with the OpenAITokenizer class: + >>> from canopy.tokenizer import Tokenizer + >>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo") + + You can then use the tokenizer instance from anywhere in the code: + >>> from canopy.tokenizer import Tokenizer + >>> tokenizer = Tokenizer() + >>> tokenizer.tokenize("Hello world!") + ['Hello', ' world', '!'] + """ # noqa: E501 MESSAGE_TOKENS_OVERHEAD = 3 FIXED_PREFIX_TOKENS = 3 def __init__(self, model_name: str = "gpt-3.5-turbo"): + """ + Args: + model_name: The name of the model to use. Defaults to "gpt-3.5-turbo". + You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18 + As you can see, both gpt-3.5 and gpt-4 are using the same cl100k_base tokenizer. + """ # noqa: E501 self._encoder = tiktoken.encoding_for_model(model_name) def tokenize(self, text: str) -> List[str]: + """ + Tokenize a text using tiktoken. + + Args: + text: The text to tokenize. + + Returns: + The list of tokens. 
+ """ return [self._encoder.decode([encoded_token]) for encoded_token in self._encode(text)] def detokenize(self, tokens: List[str]) -> str: + """ + Detokenize a list of tokens that were previously tokenized using this tokenizer. + + Args: + tokens: The list of tokens to detokenize. + + Returns: + The detokenized text as a string. + """ if not isinstance(tokens, List): raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") return "".join(tokens) def token_count(self, text: str) -> int: + """ + Count the number of tokens in a text. + + Args: + text: The text to count the tokens of. + + Returns: + The number of tokens in the text. + """ return len(self._encode(text)) def _encode(self, text): return self._encoder.encode(text, disallowed_special=()) def messages_token_count(self, messages: Messages) -> int: - # Adapted from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb # noqa + """ + Count the number of tokens in a list of messages as expected to be counted by OpenAI models. + Account for the overhead of the messages structure. + Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb + + Args: + messages: The list of messages to count the tokens of. + + Returns: + The number of tokens in the messages, as expected to be counted by OpenAI models. 
+ """ # noqa: E501 num_tokens = 0 for message in messages: num_tokens += self.MESSAGE_TOKENS_OVERHEAD From c0ed6d3591af66f5a3297eb582dfdad082cdb3de Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Wed, 1 Nov 2023 12:17:31 +0200 Subject: [PATCH 3/7] lint --- src/canopy/tokenizer/openai.py | 10 ++++++---- src/canopy/tokenizer/tokenizer.py | 20 +++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/canopy/tokenizer/openai.py b/src/canopy/tokenizer/openai.py index 180cd4fd..2c00256a 100644 --- a/src/canopy/tokenizer/openai.py +++ b/src/canopy/tokenizer/openai.py @@ -7,12 +7,12 @@ class OpenAITokenizer(BaseTokenizer): """ Tokenizer for OpenAI models, based on the tiktoken library. - + Usage: Initialize the singleton tokenizer with the OpenAITokenizer class: >>> from canopy.tokenizer import Tokenizer >>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo") - + You can then use the tokenizer instance from anywhere in the code: >>> from canopy.tokenizer import Tokenizer >>> tokenizer = Tokenizer() @@ -25,6 +25,8 @@ class OpenAITokenizer(BaseTokenizer): def __init__(self, model_name: str = "gpt-3.5-turbo"): """ + Initialize the tokenizer. + Args: model_name: The name of the model to use. Defaults to "gpt-3.5-turbo". You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18 @@ -79,10 +81,10 @@ def messages_token_count(self, messages: Messages) -> int: Count the number of tokens in a list of messages as expected to be counted by OpenAI models. Account for the overhead of the messages structure. Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb - + Args: messages: The list of messages to count the tokens of. - + Returns: The number of tokens in the messages, as expected to be counted by OpenAI models. 
""" # noqa: E501 diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 508c23c9..4a5d08f5 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -2,6 +2,7 @@ from .openai import OpenAITokenizer from .base import BaseTokenizer +from ..models.data_models import Messages class Tokenizer: @@ -9,13 +10,13 @@ class Tokenizer: """ Singleton class for tokenization. The singleton behavior unify tokenization across the system. - + Usage: - + To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, *args, **kwargs) >>> from canopy.tokenizer import Tokenizer >>> Tokenizer.initialize() - + Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it: >>> tokenizer = Tokenizer() >>> tokenizer.tokenize("Hello world!") @@ -49,9 +50,6 @@ def initialize(cls, tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer. **kwargs: Keyword arguments to pass to the tokenizer class constructor. - Returns: - None - Examples: Initialize the tokenizer with the default tokenizer class: @@ -97,7 +95,7 @@ def initialize_from_config(cls, config: dict): """ Initialize the tokenizer singleton from a config dictionary. Used by the config module to initialize the tokenizer from a config file. - + Args: config: A dictionary containing the tokenizer configuration. Must contain a "type" key with the tokenizer class name. """ # noqa: E501 @@ -128,7 +126,7 @@ def detokenize(self, tokens: List[str]) -> str: Returns: The joined text as a string. 
- """ + """ # noqa: E501 return self._tokenizer_instance.detokenize(tokens) # type: ignore[union-attr] def token_count(self, text: str) -> int: @@ -143,16 +141,16 @@ def token_count(self, text: str) -> int: """ return self._tokenizer_instance.token_count(text) # type: ignore[union-attr] - def messages_token_count(self, messages) -> int: + def messages_token_count(self, messages: Messages) -> int: """ Counts the number of tokens in a Messages object. Behind the scenes, for each LLM provider there might be a different overhead for each message in the prompt, which is not necessarily the same as the number of tokens in the message text. This method takes care of that overhead and returns the total number of tokens in the prompt, as counted by the LLM provider. - + Args: messages: The Messages object to count. - + Returns: The number of tokens in the Messages object. """ # noqa: E501 From cfee71c84d7be294ca564d6f150975f7424e9654 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Thu, 2 Nov 2023 12:26:27 +0200 Subject: [PATCH 4/7] Update src/canopy/tokenizer/tokenizer.py Co-authored-by: igiloh-pinecone <118673156+igiloh-pinecone@users.noreply.github.com> --- src/canopy/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 4a5d08f5..2724a124 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -17,7 +17,7 @@ class Tokenizer: >>> from canopy.tokenizer import Tokenizer >>> Tokenizer.initialize() - Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it: + Then, you can instantiate a tokenizer instance by calling Tokenizer() from anywhere in the code and use it: >>> tokenizer = Tokenizer() >>> tokenizer.tokenize("Hello world!") ['Hello', 'world', '!'] From 59eb77ef5971fb70a97387e4d18d34c689d36805 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Thu, 2 Nov 2023 12:26:33 +0200 Subject: 
[PATCH 5/7] Update src/canopy/tokenizer/tokenizer.py Co-authored-by: igiloh-pinecone <118673156+igiloh-pinecone@users.noreply.github.com> --- src/canopy/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 2724a124..af4b414f 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -69,7 +69,7 @@ def initialize(cls, ... return sum([self.token_count(message) + 3 for message in messages]) >>> Tokenizer.initialize(MyTokenizer) - Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code: + Then, you can instantiate a tokenizer instance by calling Tokenizer() from anywhere in the code: >>> from canopy.tokenizer import Tokenizer >>> tokenizer = Tokenizer() From 167569fcd397be8102c81312ac448eb9733f6013 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Thu, 2 Nov 2023 12:27:04 +0200 Subject: [PATCH 6/7] Update src/canopy/tokenizer/tokenizer.py Co-authored-by: igiloh-pinecone <118673156+igiloh-pinecone@users.noreply.github.com> --- src/canopy/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index af4b414f..8f4b6e0f 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -48,7 +48,7 @@ def initialize(cls, Args: tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer. - **kwargs: Keyword arguments to pass to the tokenizer class constructor. + **kwargs: Keyword arguments to pass to the constructor of the underlying tokenizer class (`tokenizer_class`). 
 Examples: Initialize the tokenizer with the default tokenizer class: From 5e1ad0b1764f33a80e4701a71e5e17e73c77fc27 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Thu, 2 Nov 2023 12:31:18 +0200 Subject: [PATCH 7/7] add example --- src/canopy/tokenizer/tokenizer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 4a5d08f5..61d1e90f 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -97,7 +97,15 @@ def initialize_from_config(cls, config: dict): Used by the config module to initialize the tokenizer from a config file. Args: - config: A dictionary containing the tokenizer configuration. Must contain a "type" key with the tokenizer class name. + config: A dictionary containing the tokenizer configuration. If the "type" key is omitted, the default tokenizer class (OpenAITokenizer) is used. + + Usage: + >>> from canopy.tokenizer import Tokenizer + >>> config = { + ... "type": "OpenAITokenizer", + ... "model_name": "gpt2" + ... } + >>> Tokenizer.initialize_from_config(config) """ # noqa: E501 if cls._initialized: raise ValueError("Tokenizer has already been initialized")