diff --git a/src/canopy/tokenizer/openai.py b/src/canopy/tokenizer/openai.py index 180cd4fd..2c00256a 100644 --- a/src/canopy/tokenizer/openai.py +++ b/src/canopy/tokenizer/openai.py @@ -7,12 +7,12 @@ class OpenAITokenizer(BaseTokenizer): """ Tokenizer for OpenAI models, based on the tiktoken library. - + Usage: Initialize the singleton tokenizer with the OpenAITokenizer class: >>> from canopy.tokenizer import Tokenizer >>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo") - + You can then use the tokenizer instance from anywhere in the code: >>> from canopy.tokenizer import Tokenizer >>> tokenizer = Tokenizer() @@ -25,6 +25,8 @@ class OpenAITokenizer(BaseTokenizer): def __init__(self, model_name: str = "gpt-3.5-turbo"): """ + Initialize the tokenizer. + Args: model_name: The name of the model to use. Defaults to "gpt-3.5-turbo". You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18 @@ -79,10 +81,10 @@ def messages_token_count(self, messages: Messages) -> int: Count the number of tokens in a list of messages as expected to be counted by OpenAI models. Account for the overhead of the messages structure. Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb - + Args: messages: The list of messages to count the tokens of. - + Returns: The number of tokens in the messages, as expected to be counted by OpenAI models. """ # noqa: E501 diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py index 508c23c9..4a5d08f5 100644 --- a/src/canopy/tokenizer/tokenizer.py +++ b/src/canopy/tokenizer/tokenizer.py @@ -2,6 +2,7 @@ from .openai import OpenAITokenizer from .base import BaseTokenizer +from ..models.data_models import Messages class Tokenizer: @@ -9,13 +10,13 @@ class Tokenizer: """ Singleton class for tokenization. The singleton behavior unify tokenization across the system. - + Usage: - + To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, *args, **kwargs) >>> from canopy.tokenizer import Tokenizer >>> Tokenizer.initialize() - + Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it: >>> tokenizer = Tokenizer() >>> tokenizer.tokenize("Hello world!") @@ -49,9 +50,6 @@ def initialize(cls, tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer. **kwargs: Keyword arguments to pass to the tokenizer class constructor. - Returns: - None - Examples: Initialize the tokenizer with the default tokenizer class: @@ -97,7 +95,7 @@ def initialize_from_config(cls, config: dict): """ Initialize the tokenizer singleton from a config dictionary. Used by the config module to initialize the tokenizer from a config file. - + Args: config: A dictionary containing the tokenizer configuration. Must contain a "type" key with the tokenizer class name. """ # noqa: E501 @@ -128,7 +126,7 @@ def detokenize(self, tokens: List[str]) -> str: Returns: The joined text as a string. - """ + """ # noqa: E501 return self._tokenizer_instance.detokenize(tokens) # type: ignore[union-attr] def token_count(self, text: str) -> int: @@ -143,16 +141,16 @@ def token_count(self, text: str) -> int: """ return self._tokenizer_instance.token_count(text) # type: ignore[union-attr] - def messages_token_count(self, messages) -> int: + def messages_token_count(self, messages: Messages) -> int: """ Counts the number of tokens in a Messages object. Behind the scenes, for each LLM provider there might be a different overhead for each message in the prompt, which is not necessarily the same as the number of tokens in the message text. This method takes care of that overhead and returns the total number of tokens in the prompt, as counted by the LLM provider. - + Args: messages: The Messages object to count. - + Returns: The number of tokens in the Messages object. """ # noqa: E501