This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Commit

lint
acatav committed Nov 1, 2023
1 parent 791abd8 commit c0ed6d3
Showing 2 changed files with 15 additions and 15 deletions.
10 changes: 6 additions & 4 deletions src/canopy/tokenizer/openai.py
@@ -7,12 +7,12 @@
class OpenAITokenizer(BaseTokenizer):
"""
Tokenizer for OpenAI models, based on the tiktoken library.
Usage:
Initialize the singleton tokenizer with the OpenAITokenizer class:
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo")
You can then use the tokenizer instance from anywhere in the code:
>>> from canopy.tokenizer import Tokenizer
>>> tokenizer = Tokenizer()
@@ -25,6 +25,8 @@ class OpenAITokenizer(BaseTokenizer):

def __init__(self, model_name: str = "gpt-3.5-turbo"):
"""
Initialize the tokenizer.
Args:
model_name: The name of the model to use. Defaults to "gpt-3.5-turbo".
You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18
@@ -79,10 +81,10 @@ def messages_token_count(self, messages: Messages) -> int:
Count the number of tokens in a list of messages as expected to be counted by OpenAI models.
Account for the overhead of the messages structure.
Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb
Args:
messages: The list of messages to count the tokens of.
Returns:
The number of tokens in the messages, as expected to be counted by OpenAI models.
""" # noqa: E501
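Read together, the docstrings in this file describe the intended usage. The sketch below combines them; the canopy.tokenizer.openai import path is inferred from the changed file's location and is an assumption, while the class, method, and argument names are taken directly from the docstrings above.

from canopy.tokenizer import Tokenizer
from canopy.tokenizer.openai import OpenAITokenizer  # module path inferred from src/canopy/tokenizer/openai.py

# Initialize the singleton once, e.g. at application startup.
Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo")

# Any other module can then grab the same tokenizer instance.
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("Hello world!")    # list of string tokens
print(tokenizer.token_count("Hello world!"))   # number of tokens in the text
print(tokenizer.detokenize(tokens))            # tokens joined back into a string
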
20 changes: 9 additions & 11 deletions src/canopy/tokenizer/tokenizer.py
@@ -2,20 +2,21 @@

from .openai import OpenAITokenizer
from .base import BaseTokenizer
from ..models.data_models import Messages


class Tokenizer:

"""
Singleton class for tokenization.
The singleton behavior unifies tokenization across the system.
Usage:
To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, *args, **kwargs)
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize()
Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it:
>>> tokenizer = Tokenizer()
>>> tokenizer.tokenize("Hello world!")
@@ -49,9 +50,6 @@ def initialize(cls,
tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer.
**kwargs: Keyword arguments to pass to the tokenizer class constructor.
Returns:
None
Examples:
Initialize the tokenizer with the default tokenizer class:
@@ -97,7 +95,7 @@ def initialize_from_config(cls, config: dict):
"""
Initialize the tokenizer singleton from a config dictionary.
Used by the config module to initialize the tokenizer from a config file.
Args:
config: A dictionary containing the tokenizer configuration. Must contain a "type" key with the tokenizer class name.
""" # noqa: E501
@@ -128,7 +126,7 @@ def detokenize(self, tokens: List[str]) -> str:
Returns:
The joined text as a string.
"""
""" # noqa: E501
return self._tokenizer_instance.detokenize(tokens) # type: ignore[union-attr]

def token_count(self, text: str) -> int:
@@ -143,16 +141,16 @@ def token_count(self, text: str) -> int:
"""
return self._tokenizer_instance.token_count(text) # type: ignore[union-attr]

- def messages_token_count(self, messages) -> int:
+ def messages_token_count(self, messages: Messages) -> int:
"""
Counts the number of tokens in a Messages object.
Behind the scenes, for each LLM provider there might be a different overhead for each message in the prompt,
which is not necessarily the same as the number of tokens in the message text.
This method takes care of that overhead and returns the total number of tokens in the prompt, as counted by the LLM provider.
Args:
messages: The Messages object to count.
Returns:
The number of tokens in the Messages object.
""" # noqa: E501
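To close the loop on the type-hint change above, a sketch of counting tokens for a chat prompt via the singleton. Only the Messages import and the messages_token_count signature appear in this diff; the UserMessage and AssistantMessage constructors are assumed names for the message models in canopy.models.data_models.

from canopy.tokenizer import Tokenizer
from canopy.models.data_models import Messages, UserMessage, AssistantMessage  # message classes are assumed names

Tokenizer.initialize()  # defaults to OpenAITokenizer, per the initialize() docstring

messages: Messages = [
    UserMessage(content="What is a tokenizer?"),
    AssistantMessage(content="It splits text into tokens."),
]

tokenizer = Tokenizer()
# Includes the per-message overhead described in messages_token_count's docstring,
# not just the tokens in the message text itself.
print(tokenizer.messages_token_count(messages))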
