From 99cdf22f79ecd82960de8a4e688cc4bf169e7743 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Fri, 21 Jun 2024 14:04:38 +0200 Subject: [PATCH] feat(router): add truncation direction parameter (#299) --- Cargo.toml | 10 +++- Dockerfile-cuda | 5 ++ Dockerfile-cuda-all | 4 +- core/src/infer.rs | 37 +++++++++++-- core/src/tokenization.rs | 15 ++++- proto/tei.proto | 12 ++++ router/src/grpc/server.rs | 112 +++++++++++++++++++++++++++++++++----- router/src/http/server.rs | 69 ++++++++++++++--------- router/src/http/types.rs | 16 ++++++ 9 files changed, 228 insertions(+), 52 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7d7be46e..2338c6f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,12 +24,18 @@ candle-transformers = { git = "https://github.com/OlivierDehaene/candle", rev = candle-flash-attn = { git = "https://github.com/OlivierDehaene/candle", rev = "33b7ecf9ed82bb7c20f1a94555218fabfbaa2fe3", package = "candle-flash-attn" } hf-hub = { git = "https://github.com/huggingface/hf-hub", rev = "b167f69692be5f49eb8003788f7f8a499a98b096" } - [profile.release] debug = 0 -incremental = true lto = "fat" opt-level = 3 codegen-units = 1 strip = "symbols" panic = "abort" + +[profile.release-debug] +inherits = "release" +debug = 1 +lto = "thin" +codegen-units = 16 +strip = "none" +incremental = true diff --git a/Dockerfile-cuda b/Dockerfile-cuda index 990e4261..d67eba93 100644 --- a/Dockerfile-cuda +++ b/Dockerfile-cuda @@ -35,6 +35,11 @@ ARG CUDA_COMPUTE_CAP=80 ARG GIT_SHA ARG DOCKER_LABEL +# Limit parallelism +ARG RAYON_NUM_THREADS +ARG CARGO_BUILD_JOBS +ARG CARGO_BUILD_INCREMENTAL + # sccache specific variables ARG ACTIONS_CACHE_URL ARG ACTIONS_RUNTIME_TOKEN diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all index c7ed2e5b..71321705 100644 --- a/Dockerfile-cuda-all +++ b/Dockerfile-cuda-all @@ -40,8 +40,10 @@ ARG ACTIONS_CACHE_URL ARG ACTIONS_RUNTIME_TOKEN ARG SCCACHE_GHA_ENABLED -# limit the number of kernels built at the same time +# Limit parallelism ARG RAYON_NUM_THREADS=4 +ARG CARGO_BUILD_JOBS +ARG CARGO_BUILD_INCREMENTAL WORKDIR /usr/src diff --git a/core/src/infer.rs b/core/src/infer.rs index 54f755d9..0f95ff8b 100644 --- a/core/src/infer.rs +++ b/core/src/infer.rs @@ -4,6 +4,7 @@ use crate::TextEmbeddingsError; use std::sync::Arc; use std::time::{Duration, Instant}; use text_embeddings_backend::{Backend, BackendError, Embedding, ModelType}; +use tokenizers::TruncationDirection; use tokio::sync::{mpsc, oneshot, watch, Notify, OwnedSemaphorePermit, Semaphore}; use tracing::instrument; @@ -117,6 +118,7 @@ impl Infer { &self, inputs: I, truncate: bool, + truncation_direction: TruncationDirection, permit: OwnedSemaphorePermit, ) -> Result { let start_time = Instant::now(); @@ -131,7 +133,14 @@ impl Infer { } let results = self - .embed(inputs, truncate, false, &start_time, permit) + .embed( + inputs, + truncate, + truncation_direction, + false, + &start_time, + permit, + ) .await?; let InferResult::AllEmbedding(response) = results else { @@ -165,6 +174,7 @@ impl Infer { &self, inputs: I, truncate: bool, + truncation_direction: TruncationDirection, permit: OwnedSemaphorePermit, ) -> Result { let start_time = Instant::now(); @@ -179,7 +189,14 @@ impl Infer { } let results = self - .embed(inputs, truncate, true, &start_time, permit) + .embed( + inputs, + truncate, + truncation_direction, + true, + &start_time, + permit, + ) .await?; let InferResult::PooledEmbedding(response) = results else { @@ -213,6 +230,7 @@ impl Infer { &self, inputs: I, truncate: bool, + truncation_direction: 
TruncationDirection, normalize: bool, permit: OwnedSemaphorePermit, ) -> Result { @@ -228,7 +246,14 @@ impl Infer { } let results = self - .embed(inputs, truncate, true, &start_time, permit) + .embed( + inputs, + truncate, + truncation_direction, + true, + &start_time, + permit, + ) .await?; let InferResult::PooledEmbedding(mut response) = results else { @@ -278,6 +303,7 @@ impl Infer { &self, inputs: I, truncate: bool, + truncation_direction: TruncationDirection, pooling: bool, start_time: &Instant, _permit: OwnedSemaphorePermit, @@ -296,7 +322,7 @@ impl Infer { // Tokenization let encoding = self .tokenization - .encode(inputs.into(), truncate) + .encode(inputs.into(), truncate, truncation_direction) .await .map_err(|err| { metrics::increment_counter!("te_request_failure", "err" => "tokenization"); @@ -340,6 +366,7 @@ impl Infer { &self, inputs: I, truncate: bool, + truncation_direction: TruncationDirection, raw_scores: bool, _permit: OwnedSemaphorePermit, ) -> Result { @@ -357,7 +384,7 @@ impl Infer { // Tokenization let encoding = self .tokenization - .encode(inputs.into(), truncate) + .encode(inputs.into(), truncate, truncation_direction) .await .map_err(|err| { metrics::increment_counter!("te_request_failure", "err" => "tokenization"); diff --git a/core/src/tokenization.rs b/core/src/tokenization.rs index 46f1411e..07226823 100644 --- a/core/src/tokenization.rs +++ b/core/src/tokenization.rs @@ -64,6 +64,7 @@ impl Tokenization { &self, inputs: EncodingInput, truncate: bool, + truncation_direction: TruncationDirection, ) -> Result { // Check if inputs is empty if inputs.is_empty() { @@ -80,6 +81,7 @@ impl Tokenization { .send(TokenizerRequest::Encode( inputs, truncate, + truncation_direction, response_sender, Span::current(), )) @@ -163,7 +165,13 @@ fn tokenizer_worker( // Loop over requests while let Some(request) = receiver.blocking_recv() { match request { - TokenizerRequest::Encode(inputs, truncate, response_tx, parent_span) => { + TokenizerRequest::Encode( + inputs, + truncate, + truncation_direction, + response_tx, + parent_span, + ) => { parent_span.in_scope(|| { if !response_tx.is_closed() { // It's possible that the user dropped its request resulting in a send error. 
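Before the second tokenization.rs hunk below, it is worth seeing where the new argument lands: `encode_input` folds `truncation_direction` into the `TruncationParams` it hands to the tokenizer. A minimal standalone sketch of that `tokenizers` API, assuming a recent release (0.15 or later, where `with_truncation` returns a `Result`) with the default `http` feature; the model id is illustrative:

```rust
use tokenizers::{Tokenizer, TruncationDirection, TruncationParams, TruncationStrategy};

fn main() -> tokenizers::Result<()> {
    // Illustrative model id, fetched from the Hub via the `http` feature.
    let mut tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None)?;

    // Same construction as in `encode_input`: only `direction` is caller-controlled;
    // the other fields stay fixed by the router.
    tokenizer.with_truncation(Some(TruncationParams {
        direction: TruncationDirection::Left, // keep the tail of over-long inputs
        max_length: 8,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    }))?;

    let encoding = tokenizer.encode("a deliberately long input that will not fit", true)?;
    println!("{:?}", encoding.get_tokens()); // only the last tokens survive
    Ok(())
}
```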
@@ -171,6 +179,7 @@ fn tokenizer_worker( let _ = response_tx.send(encode_input( inputs, truncate, + truncation_direction, max_input_length, position_offset, &mut tokenizer, @@ -247,13 +256,14 @@ fn tokenize_input( fn encode_input( inputs: EncodingInput, truncate: bool, + truncation_direction: TruncationDirection, max_input_length: usize, position_offset: usize, tokenizer: &mut Tokenizer, ) -> Result { // Default truncation params let truncate_params = truncate.then_some(TruncationParams { - direction: TruncationDirection::Right, + direction: truncation_direction, max_length: max_input_length, strategy: TruncationStrategy::LongestFirst, stride: 0, @@ -316,6 +326,7 @@ enum TokenizerRequest { Encode( EncodingInput, bool, + TruncationDirection, oneshot::Sender>, Span, ), diff --git a/proto/tei.proto b/proto/tei.proto index 6538e34a..394c0262 100644 --- a/proto/tei.proto +++ b/proto/tei.proto @@ -69,10 +69,16 @@ message Metadata { uint64 inference_time_ns = 6; } +enum TruncationDirection { + TRUNCATION_DIRECTION_RIGHT = 0; + TRUNCATION_DIRECTION_LEFT = 1; +} + message EmbedRequest { string inputs = 1; bool truncate = 2; bool normalize = 3; + TruncationDirection truncation_direction = 4; } message EmbedResponse { @@ -83,6 +89,7 @@ message EmbedResponse { message EmbedSparseRequest { string inputs = 1; bool truncate = 2; + TruncationDirection truncation_direction = 3; } message SparseValue { @@ -98,6 +105,7 @@ message EmbedSparseResponse { message EmbedAllRequest { string inputs = 1; bool truncate = 2; + TruncationDirection truncation_direction = 3; } message TokenEmbedding { @@ -113,12 +121,14 @@ message PredictRequest { string inputs = 1; bool truncate = 2; bool raw_scores = 3; + TruncationDirection truncation_direction = 4; } message PredictPairRequest { repeated string inputs = 1; bool truncate = 2; bool raw_scores = 3; + TruncationDirection truncation_direction = 4; } message Prediction { @@ -137,6 +147,7 @@ message RerankRequest { bool truncate = 3; bool raw_scores = 4; bool return_text = 5; + TruncationDirection truncation_direction = 6; } message RerankStreamRequest{ @@ -147,6 +158,7 @@ message RerankStreamRequest{ bool raw_scores = 4; // The server will only consider the first value bool return_text = 5; + TruncationDirection truncation_direction = 6; } message Rank { diff --git a/router/src/grpc/server.rs b/router/src/grpc/server.rs index 913455b3..98ee5601 100644 --- a/router/src/grpc/server.rs +++ b/router/src/grpc/server.rs @@ -1,7 +1,7 @@ use crate::grpc::pb::tei::v1::{ EmbedAllRequest, EmbedAllResponse, EmbedSparseRequest, EmbedSparseResponse, EncodeRequest, EncodeResponse, PredictPairRequest, RerankStreamRequest, SimpleToken, SparseValue, - TokenEmbedding, + TokenEmbedding, TruncationDirection, }; use crate::grpc::{ DecodeRequest, DecodeResponse, EmbedRequest, EmbedResponse, InfoRequest, InfoResponse, @@ -80,9 +80,16 @@ impl TextEmbeddingsService { let start_time = Instant::now(); let compute_chars = request.inputs.chars().count(); + let truncation_direction = convert_truncation_direction(request.truncation_direction); let response = self .infer - .embed_pooled(request.inputs, request.truncate, request.normalize, permit) + .embed_pooled( + request.inputs, + request.truncate, + truncation_direction, + request.normalize, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -128,9 +135,15 @@ impl TextEmbeddingsService { let start_time = Instant::now(); let compute_chars = request.inputs.chars().count(); + let truncation_direction = 
convert_truncation_direction(request.truncation_direction); let response = self .infer - .embed_sparse(request.inputs, request.truncate, permit) + .embed_sparse( + request.inputs, + request.truncate, + truncation_direction, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -187,9 +200,15 @@ impl TextEmbeddingsService { let start_time = Instant::now(); let compute_chars = request.inputs.chars().count(); + let truncation_direction = convert_truncation_direction(request.truncation_direction); let response = self .infer - .embed_all(request.inputs, request.truncate, permit) + .embed_all( + request.inputs, + request.truncate, + truncation_direction, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -236,6 +255,7 @@ impl TextEmbeddingsService { &self, inputs: I, truncate: bool, + truncation_direction: tokenizers::TruncationDirection, raw_scores: bool, permit: OwnedSemaphorePermit, ) -> Result<(PredictResponse, ResponseMetadata), Status> { @@ -251,7 +271,7 @@ impl TextEmbeddingsService { let response = self .infer - .predict(inputs, truncate, raw_scores, permit) + .predict(inputs, truncate, truncation_direction, raw_scores, permit) .await .map_err(ErrorResponse::from)?; @@ -701,8 +721,15 @@ impl grpc::predict_server::Predict for TextEmbeddingsService { .map_err(ErrorResponse::from)?; let request = request.into_inner(); + let truncation_direction = convert_truncation_direction(request.truncation_direction); let (response, metadata) = self - .predict_inner(request.inputs, request.truncate, request.raw_scores, permit) + .predict_inner( + request.inputs, + request.truncate, + truncation_direction, + request.raw_scores, + permit, + ) .await?; let headers = HeaderMap::from(metadata); @@ -743,8 +770,15 @@ impl grpc::predict_server::Predict for TextEmbeddingsService { .try_acquire_permit() .map_err(ErrorResponse::from)?; + let truncation_direction = convert_truncation_direction(request.truncation_direction); let (response, metadata) = self - .predict_inner(inputs, request.truncate, request.raw_scores, permit) + .predict_inner( + inputs, + request.truncate, + truncation_direction, + request.raw_scores, + permit, + ) .await?; let headers = HeaderMap::from(metadata); @@ -767,8 +801,15 @@ impl grpc::predict_server::Predict for TextEmbeddingsService { // Clone for move below let clone = self.clone(); let function = |req: PredictRequest, permit: OwnedSemaphorePermit| async move { + let truncation_direction = convert_truncation_direction(req.truncation_direction); clone - .predict_inner(req.inputs, req.truncate, req.raw_scores, permit) + .predict_inner( + req.inputs, + req.truncate, + truncation_direction, + req.raw_scores, + permit, + ) .await }; @@ -800,8 +841,15 @@ impl grpc::predict_server::Predict for TextEmbeddingsService { } }; + let truncation_direction = convert_truncation_direction(req.truncation_direction); clone - .predict_inner(inputs, req.truncate, req.raw_scores, permit) + .predict_inner( + inputs, + req.truncate, + truncation_direction, + req.raw_scores, + permit, + ) .await }; @@ -862,12 +910,19 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { let rerank_inner = move |query: String, text: String, truncate: bool, + truncation_direction: tokenizers::TruncationDirection, raw_scores: bool, infer: Infer| async move { let permit = infer.acquire_permit().await; let response = infer - .predict((query, text), truncate, raw_scores, permit) + .predict( + (query, text), + truncate, + truncation_direction, + raw_scores, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ 
-902,6 +957,7 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { let mut futures = Vec::with_capacity(batch_size); let query_chars = request.query.chars().count(); let mut total_compute_chars = query_chars * batch_size; + let truncation_direction = convert_truncation_direction(request.truncation_direction); for text in &request.texts { total_compute_chars += text.chars().count(); @@ -910,6 +966,7 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { request.query.clone(), text.clone(), request.truncate, + truncation_direction, request.raw_scores, local_infer, )) @@ -1027,11 +1084,18 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { query: String, text: String, truncate: bool, + truncation_direction: tokenizers::TruncationDirection, raw_scores: bool, infer: Infer, permit: OwnedSemaphorePermit| async move { let response = infer - .predict((query, text.clone()), truncate, raw_scores, permit) + .predict( + (query, text.clone()), + truncate, + truncation_direction, + raw_scores, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -1055,7 +1119,14 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { // Create bounded channel to have an upper bound of spawned tasks // We will have at most `max_parallel_stream_requests` messages from this stream in the queue let (rerank_sender, mut rerank_receiver) = mpsc::channel::<( - (usize, String, String, bool, bool), + ( + usize, + String, + String, + bool, + tokenizers::TruncationDirection, + bool, + ), oneshot::Sender< Result<(usize, usize, Duration, Duration, Duration, f32, String), ErrorResponse>, >, @@ -1066,8 +1137,10 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { // Background task that uses the bounded channel tokio::spawn(async move { - while let Some(((index, query, text, truncate, raw_scores), mut sender)) = - rerank_receiver.recv().await + while let Some(( + (index, query, text, truncate, truncation_direction, raw_scores), + mut sender, + )) = rerank_receiver.recv().await { // Wait on permit before spawning the task to avoid creating more tasks than needed let permit = local_infer.acquire_permit().await; @@ -1079,7 +1152,7 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { tokio::spawn(async move { // Select on closed to cancel work if the stream was closed tokio::select! 
{ - result = rerank_inner(index, query, text, truncate, raw_scores, task_infer, permit) => { + result = rerank_inner(index, query, text, truncate, truncation_direction, raw_scores, task_infer, permit) => { let _ = sender.send(result); } _ = sender.closed() => {} @@ -1118,6 +1191,7 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { total_compute_chars += request.query.chars().count(); total_compute_chars += request.text.chars().count(); + let truncation_direction = convert_truncation_direction(request.truncation_direction); rerank_sender .send(( ( @@ -1125,6 +1199,7 @@ impl grpc::rerank_server::Rerank for TextEmbeddingsService { request.query, request.text, request.truncate, + truncation_direction, raw_scores.unwrap(), ), result_sender, @@ -1434,3 +1509,10 @@ impl From for Status { Status::new(code, value.error) } } + +fn convert_truncation_direction(value: i32) -> tokenizers::TruncationDirection { + match TruncationDirection::try_from(value).expect("Unexpected enum value") { + TruncationDirection::Right => tokenizers::TruncationDirection::Right, + TruncationDirection::Left => tokenizers::TruncationDirection::Left, + } +} diff --git a/router/src/http/server.rs b/router/src/http/server.rs index ddf5d2bd..3e23991f 100644 --- a/router/src/http/server.rs +++ b/router/src/http/server.rs @@ -30,6 +30,7 @@ use text_embeddings_core::infer::{ AllEmbeddingsInferResponse, Infer, InferMetadata, PooledEmbeddingsInferResponse, }; use text_embeddings_core::TextEmbeddingsError; +use tokenizers::TruncationDirection; use tokio::sync::OwnedSemaphorePermit; use tower_http::cors::{AllowOrigin, CorsLayer}; use tracing::instrument; @@ -103,7 +104,6 @@ async fn predict( // Closure for predict let predict_inner = move |inputs: Sequence, truncate: bool, - raw_scores: bool, infer: Infer, info: Info, permit: Option| async move { @@ -113,7 +113,13 @@ async fn predict( }; let response = infer - .predict(inputs, truncate, raw_scores, permit) + .predict( + inputs, + truncate, + req.truncation_direction, + req.raw_scores, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -159,15 +165,8 @@ async fn predict( let compute_chars = inputs.count_chars(); let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?; - let (prompt_tokens, tokenization, queue, inference, predictions) = predict_inner( - inputs, - truncate, - req.raw_scores, - infer.0, - info.0, - Some(permit), - ) - .await?; + let (prompt_tokens, tokenization, queue, inference, predictions) = + predict_inner(inputs, truncate, infer.0, info.0, Some(permit)).await?; metrics::increment_counter!("te_request_success", "method" => "single"); @@ -211,7 +210,6 @@ async fn predict( futures.push(predict_inner( input, truncate, - req.raw_scores, local_infer.0, local_info.0, None, @@ -321,15 +319,17 @@ async fn rerank( })?; // Closure for rerank - let rerank_inner = move |query: String, - text: String, - truncate: bool, - raw_scores: bool, - infer: Infer| async move { + let rerank_inner = move |query: String, text: String, truncate: bool, infer: Infer| async move { let permit = infer.acquire_permit().await; let response = infer - .predict((query, text), truncate, raw_scores, permit) + .predict( + (query, text), + truncate, + req.truncation_direction, + req.raw_scores, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -375,7 +375,6 @@ async fn rerank( req.query.clone(), text.clone(), truncate, - req.raw_scores, local_infer.0, )) } @@ -484,7 +483,13 @@ async fn embed( let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?; let 
response = infer - .embed_pooled(input, truncate, req.normalize, permit) + .embed_pooled( + input, + truncate, + req.truncation_direction, + req.normalize, + permit, + ) .await .map_err(ErrorResponse::from)?; @@ -541,7 +546,13 @@ async fn embed( futures.push(async move { let permit = local_infer.acquire_permit().await; local_infer - .embed_pooled(input, truncate, req.normalize, permit) + .embed_pooled( + input, + truncate, + req.truncation_direction, + req.normalize, + permit, + ) .await }) } @@ -641,7 +652,7 @@ async fn embed_sparse( let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?; let response = infer - .embed_sparse(input, truncate, permit) + .embed_sparse(input, truncate, req.truncation_direction, permit) .await .map_err(ErrorResponse::from)?; @@ -697,7 +708,9 @@ async fn embed_sparse( let local_infer = infer.clone(); futures.push(async move { let permit = local_infer.acquire_permit().await; - let response = local_infer.embed_sparse(input, truncate, permit).await?; + let response = local_infer + .embed_sparse(input, truncate, req.truncation_direction, permit) + .await?; Ok((sparsify(response.results), response.metadata)) }) } @@ -789,7 +802,7 @@ async fn embed_all( let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?; let response = infer - .embed_all(input, truncate, permit) + .embed_all(input, truncate, req.truncation_direction, permit) .await .map_err(ErrorResponse::from)?; @@ -845,7 +858,9 @@ async fn embed_all( let local_infer = infer.clone(); futures.push(async move { let permit = local_infer.acquire_permit().await; - local_infer.embed_all(input, truncate, permit).await + local_infer + .embed_all(input, truncate, req.truncation_direction, permit) + .await }) } let results = join_all(futures) @@ -936,7 +951,7 @@ async fn openai_embed( let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?; let response = infer - .embed_pooled(input, truncate, true, permit) + .embed_pooled(input, truncate, TruncationDirection::Right, true, permit) .await .map_err(ErrorResponse::from)?; @@ -997,7 +1012,7 @@ async fn openai_embed( futures.push(async move { let permit = local_infer.acquire_permit().await; local_infer - .embed_pooled(input, truncate, true, permit) + .embed_pooled(input, truncate, TruncationDirection::Right, true, permit) .await }) } diff --git a/router/src/http/types.rs b/router/src/http/types.rs index be514d40..8655d00d 100644 --- a/router/src/http/types.rs +++ b/router/src/http/types.rs @@ -4,6 +4,7 @@ use serde::{de, Deserialize, Deserializer, Serialize}; use serde_json::json; use std::fmt::Formatter; use text_embeddings_core::tokenization::EncodingInput; +use tokenizers::TruncationDirection; use utoipa::openapi::{RefOr, Schema}; use utoipa::ToSchema; @@ -199,6 +200,9 @@ pub(crate) struct PredictRequest { #[schema(default = "false", example = "false", nullable = true)] pub truncate: Option, #[serde(default)] + #[schema(default = "right", example = "right")] + pub truncation_direction: TruncationDirection, + #[serde(default)] #[schema(default = "false", example = "false")] pub raw_scores: bool, } @@ -228,6 +232,9 @@ pub(crate) struct RerankRequest { #[schema(default = "false", example = "false", nullable = true)] pub truncate: Option, #[serde(default)] + #[schema(default = "right", example = "right")] + pub truncation_direction: TruncationDirection, + #[serde(default)] #[schema(default = "false", example = "false")] pub raw_scores: bool, #[serde(default)] @@ -323,6 +330,9 @@ pub(crate) struct EmbedRequest { #[serde(default)] 
#[schema(default = "false", example = "false", nullable = true)] pub truncate: Option, + #[serde(default)] + #[schema(default = "right", example = "right")] + pub truncation_direction: TruncationDirection, #[serde(default = "default_normalize")] #[schema(default = "true", example = "true")] pub normalize: bool, @@ -342,6 +352,9 @@ pub(crate) struct EmbedSparseRequest { #[serde(default)] #[schema(default = "false", example = "false", nullable = true)] pub truncate: Option, + #[serde(default)] + #[schema(default = "right", example = "right")] + pub truncation_direction: TruncationDirection, } #[derive(Serialize, ToSchema)] @@ -359,6 +372,9 @@ pub(crate) struct EmbedAllRequest { #[serde(default)] #[schema(default = "false", example = "false", nullable = true)] pub truncate: Option, + #[serde(default)] + #[schema(default = "right", example = "right")] + pub truncation_direction: TruncationDirection, } #[derive(Serialize, ToSchema)]