wip: spec generate parameters
Commit 826181a (parent e4f3d13), committed by SBrandeis on Jan 26, 2024
Showing 5 changed files with 387 additions and 6 deletions.
@@ -40,11 +40,91 @@ export interface AutomaticSpeechRecognitionParameters {
*/
export interface GenerationParameters {
/**
* I can be the papa you'd be the mama
* Whether to use sampling instead of greedy decoding when generating new tokens.
*/
doSample?: boolean;
/**
* Controls the stopping condition for beam-based methods.
*/
earlyStopping?: EarlyStoppingUnion;
/**
* If set to float strictly between 0 and 1, only tokens with a conditional probability
* greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
* 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
* Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
*/
epsilonCutoff?: number;
/**
* Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
* float strictly between 0 and 1, a token is only considered if it is greater than either
* eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
* term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
* the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
* See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
* for more details.
*/
etaCutoff?: number;
/**
* The maximum length (in tokens) of the generated text, including the input.
*/
maxLength?: number;
/**
* The maximum number of tokens to generate. Takes precedence over maxLength.
*/
maxNewTokens?: number;
/**
* The minimum length (in tokens) of the generated text, including the input.
*/
minLength?: number;
/**
* The minimum number of tokens to generate. Takes precedence over minLength.
*/
minNewTokens?: number;
/**
* Number of groups to divide num_beams into in order to ensure diversity among different
* groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
*/
numBeamGroups?: number;
/**
* Number of beams to use for beam search.
*/
numBeams?: number;
/**
* The value balances the model confidence and the degeneration penalty in contrastive
* search decoding.
*/
penaltyAlpha?: number;
/**
* The value used to modulate the next token probabilities.
*/
temperature?: number;
/**
* The number of highest probability vocabulary tokens to keep for top-k-filtering.
*/
topK?: number;
/**
* If set to float < 1, only the smallest set of most probable tokens with probabilities
* that add up to top_p or higher are kept for generation.
*/
topP?: number;
/**
* Local typicality measures how similar the conditional probability of predicting a target
* token next is to the expected conditional probability of predicting a random token next,
* given the partial text already generated. If set to float < 1, the smallest set of the
* most locally typical tokens with probabilities that add up to typical_p or higher are
* kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
*/
typicalP?: number;
/**
* Whether the model should use the past key/value attentions (cache) to speed up decoding.
*/
useCache?: boolean;
[property: string]: unknown;
}
/**
* Controls the stopping condition for beam-based methods.
*/
export type EarlyStoppingUnion = boolean | "never";
export interface AutomaticSpeechRecognitionOutputChunk {
/**
* A chunk of text identified by the model
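For context, a minimal sketch of how a client might fill in these fields once the types are generated. The import path is an assumption about where the generated module lives; the field names and types follow the GenerationParameters interface shown above.

// Sketch only: import path assumed; shape follows GenerationParameters above.
import type { GenerationParameters } from "./inference";

const samplingParams: GenerationParameters = {
	doSample: true, // sample instead of greedy decoding
	temperature: 0.7, // soften the next-token distribution
	topK: 50, // keep only the 50 most probable tokens
	topP: 0.95, // nucleus sampling: smallest token set whose probabilities sum to 0.95
	maxNewTokens: 128, // cap on newly generated tokens; takes precedence over maxLength
};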
82 changes: 81 additions & 1 deletion packages/tasks/src/tasks/image-to-text/inference.ts
@@ -40,11 +40,91 @@ export interface ImageToTextParameters {
*/
export interface GenerationParameters {
/**
* I can be the papa you'd be the mama
* Whether to use sampling instead of greedy decoding when generating new tokens.
*/
doSample?: boolean;
/**
* Controls the stopping condition for beam-based methods.
*/
earlyStopping?: EarlyStoppingUnion;
/**
* If set to float strictly between 0 and 1, only tokens with a conditional probability
* greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
* 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
* Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
*/
epsilonCutoff?: number;
/**
* Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
* float strictly between 0 and 1, a token is only considered if it is greater than either
* eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
* term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
* the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
* See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
* for more details.
*/
etaCutoff?: number;
/**
* The maximum length (in tokens) of the generated text, including the input.
*/
maxLength?: number;
/**
* The maximum number of tokens to generate. Takes precedence over maxLength.
*/
maxNewTokens?: number;
/**
* The minimum length (in tokens) of the generated text, including the input.
*/
minLength?: number;
/**
* The minimum number of tokens to generate. Takes precedence over minLength.
*/
minNewTokens?: number;
/**
* Number of groups to divide num_beams into in order to ensure diversity among different
* groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
*/
numBeamGroups?: number;
/**
* Number of beams to use for beam search.
*/
numBeams?: number;
/**
* The value balances the model confidence and the degeneration penalty in contrastive
* search decoding.
*/
penaltyAlpha?: number;
/**
* The value used to modulate the next token probabilities.
*/
temperature?: number;
/**
* The number of highest probability vocabulary tokens to keep for top-k-filtering.
*/
topK?: number;
/**
* If set to float < 1, only the smallest set of most probable tokens with probabilities
* that add up to top_p or higher are kept for generation.
*/
topP?: number;
/**
* Local typicality measures how similar the conditional probability of predicting a target
* token next is to the expected conditional probability of predicting a random token next,
* given the partial text already generated. If set to float < 1, the smallest set of the
* most locally typical tokens with probabilities that add up to typical_p or higher are
* kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
*/
typicalP?: number;
/**
* Whether the model should use the past key/value attentions (cache) to speed up decoding.
*/
useCache?: boolean;
[property: string]: unknown;
}
/**
* Controls the stopping condition for beam-based methods.
*/
export type EarlyStoppingUnion = boolean | "never";
export type ImageToTextOutput = ImageToTextOutputElement[];
/**
* Outputs of inference for the Image To Text task
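A second hedged sketch, this time for beam-based decoding, to illustrate how EarlyStoppingUnion is meant to be used. The import path is again an assumption; everything else mirrors the interface above.

// Sketch only: import path assumed.
import type { GenerationParameters } from "./inference";

const beamParams: GenerationParameters = {
	doSample: false, // beam decoding rather than sampling
	numBeams: 8, // total beams for beam search
	numBeamGroups: 4, // split the 8 beams into 4 diverse groups of 2
	earlyStopping: "never", // boolean | "never": keep searching until the canonical stopping condition
	minNewTokens: 16,
	maxNewTokens: 64,
};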
64 changes: 62 additions & 2 deletions packages/tasks/src/tasks/schema-utils.json
@@ -1,7 +1,7 @@
{
"$id": "/inference/schemas/schema-utils.json",
"$schema": "http://json-schema.org/draft-06/schema#",
"description": "Common type definitions shared by several tasks",
"description": "(Incomplete!) Common type definitions shared by several tasks",
"definitions": {
"GenerationParameters": {
"title": "GenerationParameters",
@@ -10,7 +10,67 @@
"properties": {
"temperature": {
"type": "number",
"description": "I can be the papa you'd be the mama"
"description": "The value used to modulate the next token probabilities."
},
"topK": {
"type": "integer",
"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
},
"topP": {
"type": "number",
"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
},
"typicalP": {
"type": "number",
"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
},
"epsilonCutoff": {
"type": "number",
"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
},
"etaCutoff": {
"type": "number",
"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
},
"maxLength": {
"type": "integer",
"description": "The maximum length (in tokens) of the generated text, including the input."
},
"maxNewTokens": {
"type": "integer",
"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
},
"minLength": {
"type": "integer",
"description": "The minimum length (in tokens) of the generated text, including the input."
},
"minNewTokens": {
"type": "integer",
"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
},
"doSample": {
"type": "boolean",
"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
},
"earlyStopping": {
"description": "Controls the stopping condition for beam-based methods.",
"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
},
"numBeams": {
"type": "integer",
"description": "Number of beams to use for beam search."
},
"numBeamGroups": {
"type": "integer",
"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
},
"penaltyAlpha": {
"type": "number",
"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
},
"useCache": {
"type": "boolean",
"description": "Whether the model should use the past last key/values attentions to speed up decoding"
}
}
}
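Since the definitions above are plain JSON Schema (draft-06), they can also be used for runtime validation. A minimal sketch with Ajv, assuming a validator configured to accept draft-06 schemas and a build setup that can import JSON files:

// Sketch only: assumes Ajv set up for draft-06 schemas and resolveJsonModule enabled.
import Ajv from "ajv";
import schemaUtils from "./schema-utils.json";

const ajv = new Ajv();
ajv.addSchema(schemaUtils); // registered under its $id: /inference/schemas/schema-utils.json

const validate = ajv.getSchema(
	"/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
);
const ok = validate?.({ temperature: 0.7, topK: 50, earlyStopping: "never" });
console.log(ok, validate?.errors ?? null);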
82 changes: 81 additions & 1 deletion packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -36,11 +36,91 @@ export interface TextToAudioParameters {
*/
export interface GenerationParameters {
/**
* I can be the papa you'd be the mama
* Whether to use sampling instead of greedy decoding when generating new tokens.
*/
doSample?: boolean;
/**
* Controls the stopping condition for beam-based methods.
*/
earlyStopping?: EarlyStoppingUnion;
/**
* If set to float strictly between 0 and 1, only tokens with a conditional probability
* greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
* 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
* Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
*/
epsilonCutoff?: number;
/**
* Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
* float strictly between 0 and 1, a token is only considered if it is greater than either
* eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
* term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
* the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
* See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
* for more details.
*/
etaCutoff?: number;
/**
* The maximum length (in tokens) of the generated text, including the input.
*/
maxLength?: number;
/**
* The maximum number of tokens to generate. Takes precedence over maxLength.
*/
maxNewTokens?: number;
/**
* The minimum length (in tokens) of the generated text, including the input.
*/
minLength?: number;
/**
* The minimum number of tokens to generate. Takes precedence over minLength.
*/
minNewTokens?: number;
/**
* Number of groups to divide num_beams into in order to ensure diversity among different
* groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
*/
numBeamGroups?: number;
/**
* Number of beams to use for beam search.
*/
numBeams?: number;
/**
* The value balances the model confidence and the degeneration penalty in contrastive
* search decoding.
*/
penaltyAlpha?: number;
/**
* The value used to modulate the next token probabilities.
*/
temperature?: number;
/**
* The number of highest probability vocabulary tokens to keep for top-k-filtering.
*/
topK?: number;
/**
* If set to float < 1, only the smallest set of most probable tokens with probabilities
* that add up to top_p or higher are kept for generation.
*/
topP?: number;
/**
* Local typicality measures how similar the conditional probability of predicting a target
* token next is to the expected conditional probability of predicting a random token next,
* given the partial text already generated. If set to float < 1, the smallest set of the
* most locally typical tokens with probabilities that add up to typical_p or higher are
* kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
*/
typicalP?: number;
/**
* Whether the model should use the past key/value attentions (cache) to speed up decoding.
*/
useCache?: boolean;
[property: string]: unknown;
}
/**
* Controls the stopping condition for beam-based methods.
*/
export type EarlyStoppingUnion = boolean | "never";
export type TextToAudioOutput = TextToAudioOutputElement[];
/**
* Outputs of inference for the Text To Audio task
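One last hedged sketch combining the truncation-sampling knobs with the length controls, since their valid ranges are easy to get wrong. The import path is an assumption; the values follow the ranges suggested in the descriptions above.

// Sketch only: import path assumed; epsilon/eta cutoffs must be strictly
// between 0 and 1, typically around 3e-4 to 2e-3 depending on model size.
import type { GenerationParameters } from "./inference";

const truncationParams: GenerationParameters = {
	doSample: true,
	epsilonCutoff: 3e-4, // drop tokens whose conditional probability is below this
	etaCutoff: 1e-3, // hybrid of locally typical and epsilon sampling
	typicalP: 0.9, // keep the smallest locally typical set summing to 0.9
	maxNewTokens: 256, // takes precedence over maxLength
	useCache: true, // reuse past key/value attentions to speed up decoding
};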