Skip to content

Commit

Permalink
Address the feedback on the tokenizer's library (#7024)
Browse files Browse the repository at this point in the history
* Fix cache when calling EncodeToIds

* Make EnglishRoberta _mergeRanks thread safe

* Delete Trainer

* Remove the setters on the Bpe properties

* Remove Roberta and Tiktoken special casing in the Tokenizer and support the cases in the Model abstraction

* Support text-embedding-3-small/large embedding

* Remove redundant TokenToId abstraction and keep the one with the extra parameters

* Enable creating Tiktoken asynchronously or directly using the tokenizer data

* Add cancellationToken support in CreateAsync APIs

* Rename sequence to text and Tokenize to Encode

* Rename skipSpecialTokens to considerSpecialTokens

* Rename TokenizerResult to EncodingResult

* Make Token publicly immutable

* Change offset tuples from (Index, End) to (Index, Length)

* Rename NormalizedString method's parameters

* Rename Model's methods to start with verb

* Convert Model.GetVocab() method to a Vocab property

* Rename some method parameters and variables

* Remove Vocab and VocabSize from the abstraction

* Cleanup normalization support

* Minor Bpe cleanup

* Resolve rebase change

* Address the feedback
  • Loading branch information
tarekgh authored Feb 26, 2024
1 parent 4b89d98 commit d0aa2c2
Show file tree
Hide file tree
Showing 31 changed files with 838 additions and 6,033 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@ namespace Microsoft.ML.Tokenizers
/// <summary>
/// The Encoding represents the output of a Tokenizer.
/// </summary>
public sealed class TokenizerResult
public sealed class EncodingResult
{
/// <summary>
/// Create a new object of the TokenizerResult object.
/// Create a new instance of the EncodingResult class.
/// </summary>
/// <param name="originalString">The original string before normalization.</param>
/// <param name="normalizedString">The normalized form of the original string.</param>
/// <param name="splits">The list of tokens to merge.</param>
/// <param name="offsetsMappedToOriginalString">Indicate whether the offsets is mapped to the original string or the normalized string.</param>
public TokenizerResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
public EncodingResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
{
OriginalString = originalString;
NormalizedString = normalizedString;
Expand All @@ -47,7 +47,7 @@ public TokenizerResult(string originalString, string normalizedString, IEnumerab
private List<Token>? _tokens;
private List<string>? _tokensWords;
private List<int>? _ids;
private List<(int Index, int End)>? _offsets;
private List<(int Index, int Length)>? _offsets;

internal void AddTokens(IReadOnlyList<Token> addedTokens)
{
Expand Down Expand Up @@ -121,10 +121,10 @@ public IReadOnlyList<string> Tokens
}

/// <summary>
/// Gets The list of offsets. These offsets lets you slice the input string, and thus retrieve
/// Gets the list of offsets. These offsets let you slice the input string, and thus retrieve
/// the original part that led to producing the corresponding token.
/// </summary>
public IReadOnlyList<(int Index, int End)> Offsets
public IReadOnlyList<(int Index, int Length)> Offsets
{
get
{
Expand All @@ -138,7 +138,7 @@ public IReadOnlyList<string> Tokens
return Array.Empty<(int, int)>();
}

_offsets = new List<(int Index, int End)>(_tokens.Count);
_offsets = new List<(int Index, int Length)>(_tokens.Count);

foreach (var token in _tokens)
{
Expand Down
280 changes: 137 additions & 143 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs

Large diffs are not rendered by default.

Loading

0 comments on commit d0aa2c2

Please sign in to comment.