🚧 fix(wip): add text chunker
mxchinegod committed Mar 26, 2024
1 parent d23b72b commit 31f8d9c
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions magnet/utils/globals.py
@@ -5,7 +5,38 @@
import boto3
from spacy.lang.en import English
import inspect
from transformers import AutoTokenizer

def break_into_chunks(text, model_name, max_tokens):
    """
    Break a text into chunks of a specified maximum number of tokens using the tokenizer of a given model.

    Parameters:
    - text (str): The text to break into chunks.
    - model_name (str): The model or tokenizer name to use for tokenizing the text.
    - max_tokens (int): The maximum number of tokens per chunk.

    Returns:
    - List[str]: A list of text chunks, each with up to max_tokens tokens.
    """
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the full text once, then regroup the tokens into fixed-size chunks
    tokens = tokenizer.tokenize(text)

    current_chunk = []
    chunks = []

    for token in tokens:
        current_chunk.append(token)
        # Flush the chunk as soon as it reaches the token limit
        if len(current_chunk) == max_tokens:
            chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
            current_chunk = []

    # Append any remaining tokens as the final (shorter) chunk
    if current_chunk:
        chunks.append(tokenizer.convert_tokens_to_string(current_chunk))

    return chunks

def reversal():
    return inspect.getsource(inspect.currentframe().f_back)
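
A minimal usage sketch of the new helper, assuming the package is importable as magnet.utils.globals; the model name "bert-base-uncased", the sample text, and max_tokens=128 are illustrative placeholders, not part of the commit:

# Usage sketch (illustrative only, not part of this commit).
# Any Hugging Face tokenizer name should work for model_name.
from magnet.utils.globals import break_into_chunks

text = "Some long document text that needs to be split. " * 200
chunks = break_into_chunks(text, model_name="bert-base-uncased", max_tokens=128)

print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0][:60]!r}")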
