From e4f3d138493a59bb76a6d7371913c3768b48f1eb Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 16:42:14 +0100
Subject: [PATCH] Correctly type ASR output

---
 .../automatic-speech-recognition/inference.ts | 16 ++++++++++++++++
 .../spec/output.json                          | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 244b44b69..ee17e64f4 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -45,11 +45,27 @@ export interface GenerationParameters {
 	temperature?: number;
 	[property: string]: unknown;
 }
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * The text of one chunk identified by the model
+	 */
+	text: string;
+	/**
+	 * Start and end timestamps for the text; presumably a [start, end] pair (unit unspecified)
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
 export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutputElement {
+	/**
+	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
+	 * the model.
+	 */
+	chunks?: AutomaticSpeechRecognitionOutputChunk[];
 	/**
 	 * The recognized text.
 	 */
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index 72573986d..217f210b1 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -10,6 +10,25 @@
 			"text": {
 				"type": "string",
 				"description": "The recognized text."
+			},
+			"chunks": {
+				"type": "array",
+				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+				"items": {
+					"type": "object",
+					"title": "AutomaticSpeechRecognitionOutputChunk",
+					"properties": {
+						"text": { "type": "string", "description": "A chunk of text identified by the model" },
+						"timestamps": {
+							"type": "array",
+							"description": "The start and end timestamps corresponding with the text",
+							"items": { "type": "number" },
+							"minItems": 2,
+							"maxItems": 2
+						}
+					},
+					"required": ["text", "timestamps"]
+				}
 			}
 		},
 		"required": ["text"]