From e4f3d138493a59bb76a6d7371913c3768b48f1eb Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Fri, 26 Jan 2024 16:42:14 +0100 Subject: [PATCH] Correctly type ASR output --- .../automatic-speech-recognition/inference.ts | 16 ++++++++++++++++ .../spec/output.json | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts index 244b44b69..ee17e64f4 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts +++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts @@ -45,11 +45,27 @@ export interface GenerationParameters { temperature?: number; [property: string]: unknown; } +export interface AutomaticSpeechRecognitionOutputChunk { + /** + * A chunk of text identified by the model + */ + text: string; + /** + * The start and end timestamps corresponding with the text + */ + timestamps: number[]; + [property: string]: unknown; +} export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[]; /** * Outputs of inference for the Automatic Speech Recognition task */ export interface AutomaticSpeechRecognitionOutputElement { + /** + * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by + * the model. + */ + chunks?: AutomaticSpeechRecognitionOutputChunk[]; /** * The recognized text. */ diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json index 72573986d..217f210b1 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json +++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json @@ -10,6 +10,25 @@ "text": { "type": "string", "description": "The recognized text." 
+ }, + "chunks": { + "type": "array", + "description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.", + "items": { + "type": "object", + "title": "AutomaticSpeechRecognitionOutputChunk", + "properties": { + "text": { "type": "string", "description": "A chunk of text identified by the model" }, + "timestamps": { + "type": "array", + "description": "The start and end timestamps corresponding with the text", + "items": { "type": "number" }, + "minItems": 2, + "maxItems": 2 + } + }, + "required": ["text", "timestamps"] + } } }, "required": ["text"]