From 7c50482ed66060b4a6863bffaa931199b5785f21 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 15:29:09 +0100
Subject: [PATCH 01/51] Add JSON schema spec for audio-classification

---
 .../audio-classification/spec/input.json      | 30 +++++++++++++++++++
 .../audio-classification/spec/output.json     | 24 +++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 packages/tasks/src/tasks/audio-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/audio-classification/spec/output.json

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
new file mode 100644
index 000000000..b0b8757b1
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -0,0 +1,30 @@
+{
+    "id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Audio Classification inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "On or several audio files to classify"
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/definitions/AudioClassificationParameters"
+        }
+    },
+    "definitions": {
+        "AudioClassificationParameters": {
+            "description": "Additional inference parameters for Audio Classification",
+            "type": "object",
+            "properties": {
+                "topK": {
+                    "type": "integer",
+                    "description": "When specified, limits the output to the top K most probable classes."
+                }
+            }
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
new file mode 100644
index 000000000..dfe0f0c5e
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -0,0 +1,24 @@
+{
+    "id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "title": "AudioClassificationOutput",
+    "description": "Outputs for Audio Classification inference",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "label": {
+                "type": "string",
+                "description": "The predicted class label (model specific)."
+            },
+            "score": {
+                "type": "number",
+                "description": "The corresponding probability."
+            }
+        },
+        "required": [
+            "label",
+            "score"
+        ]
+    }
+}
\ No newline at end of file

From fd98112b14fe8d7c89fe4a386dd938509bb37f7a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 15:31:29 +0100
Subject: [PATCH 02/51] Add JSON schema spec for text-generation

---
 .../src/tasks/text-generation/spec/input.json | 84 +++++++++++++++++++
 .../tasks/text-generation/spec/output.json    | 18 ++++
 2 files changed, 102 insertions(+)
 create mode 100644 packages/tasks/src/tasks/text-generation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-generation/spec/output.json

diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
new file mode 100644
index 000000000..e0e73dd68
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -0,0 +1,84 @@
+{
+    "id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Text Generation inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "The text to initialize generation with",
+            "anyOf": [
+                {
+                    "type": "string"
+                },
+                {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            ]
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/definitions/TextGenerationParameters"
+        }
+    },
+    "definitions": {
+        "TextGenerationParameters": {
+            "description": "Additional inference parameters for Text Generation",
+            "type": "object",
+            "properties": {
+                "doSample": {
+                    "type": "boolean",
+                    "description": "Whether to use logit sampling (true) or greedy search (false)."
+                },
+                "maxNewTokens": {
+                    "type": "integer",
+                    "description": "Maximum number of generated tokens."
+                },
+                "repetitionPenalty": {
+                    "type": "number",
+                    "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
+                },
+                "returnFullText": {
+                    "type": "boolean",
+                    "description": "Whether to prepend the prompt to the generated text."
+                },
+                "stopSequences": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Stop generating tokens if a member of `stop_sequences` is generated."
+                },
+                "temperature": {
+                    "type": "number",
+                    "description": "The value used to modulate the logits distribution."
+                },
+                "topK": {
+                    "type": "integer",
+                    "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+                },
+                "topP": {
+                    "type": "number",
+                    "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
+                },
+                "truncate": {
+                    "type": "integer",
+                    "description": "Truncate input tokens to the given size."
+                },
+                "typicalP": {
+                    "type": "number",
+                    "description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
+                },
+                "watermark": {
+                    "type": "boolean",
+                    "description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
+                }
+            }
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
new file mode 100644
index 000000000..163653942
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -0,0 +1,18 @@
+{
+    "id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs for Text Generation inference",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "generatedText": {
+                "type": "string",
+                "description": "The generated text"
+            }
+        },
+        "required": [
+            "generatedText"
+        ]
+    }
+}
\ No newline at end of file

From 352e7c5f2fb75a6585f0f684ba940f69a3ebffd6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:00:07 +0100
Subject: [PATCH 03/51] =?UTF-8?q?=E2=9C=A8=20Add=20script=20to=20generate?=
 =?UTF-8?q?=20inference=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |  12 +-
 packages/tasks/pnpm-lock.yaml                 | 207 ++++++++++++++++++
 .../tasks/src/scripts/inference-codegen.ts    | 103 +++++++++
 3 files changed, 318 insertions(+), 4 deletions(-)
 create mode 100644 packages/tasks/src/scripts/inference-codegen.ts

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 0917d7975..d538c6487 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -24,9 +24,10 @@
 		"format": "prettier --write .",
 		"format:check": "prettier --check .",
 		"prepublishOnly": "pnpm run build",
-		"build": "tsup src/index.ts --format cjs,esm --clean --dts",
+		"build": "tsup src/index.ts src/scripts/**.ts --format cjs,esm --clean --dts",
 		"prepare": "pnpm run build",
-		"check": "tsc"
+		"check": "tsc",
+		"inference-codegen": "pnpm run build && node dist/scripts/inference-codegen.js"
 	},
 	"files": [
 		"dist",
@@ -40,5 +41,8 @@
 	],
 	"author": "Hugging Face",
 	"license": "MIT",
-	"devDependencies": {}
-}
+	"devDependencies": {
+		"@types/node": "^20.11.5",
+		"quicktype-core": "^23.0.81"
+	}
+}
\ No newline at end of file
diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml
index 2b9f1883a..fedbbb7c3 100644
--- a/packages/tasks/pnpm-lock.yaml
+++ b/packages/tasks/pnpm-lock.yaml
@@ -3,3 +3,210 @@ lockfileVersion: '6.0'
 settings:
   autoInstallPeers: true
   excludeLinksFromLockfile: false
+
+devDependencies:
+  '@types/node':
+    specifier: ^20.11.5
+    version: 20.11.5
+  quicktype-core:
+    specifier: ^23.0.81
+    version: 23.0.81
+
+packages:
+
+  /@glideapps/ts-necessities@2.1.3:
+    resolution: {integrity: sha512-q9U8v/n9qbkd2zDYjuX3qtlbl+OIyI9zF+zQhZjfYOE9VMDH7tfcUSJ9p0lXoY3lxmGFne09yi4iiNeQUwV7AA==}
+    dev: true
+
+  /@types/node@20.11.5:
+    resolution: {integrity: sha512-g557vgQjUUfN76MZAN/dt1z3dzcUsimuysco0KeluHgrPdJXkP/XdAURgyO2W9fZWHRtRBiVKzKn8vyOAwlG+w==}
+    dependencies:
+      undici-types: 5.26.5
+    dev: true
+
+  /@types/urijs@1.19.25:
+    resolution: {integrity: sha512-XOfUup9r3Y06nFAZh3WvO0rBU4OtlfPB/vgxpjg+NRdGU6CN6djdc6OEiH+PcqHCY6eFLo9Ista73uarf4gnBg==}
+    dev: true
+
+  /abort-controller@3.0.0:
+    resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==}
+    engines: {node: '>=6.5'}
+    dependencies:
+      event-target-shim: 5.0.1
+    dev: true
+
+  /base64-js@1.5.1:
+    resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
+    dev: true
+
+  /browser-or-node@2.1.1:
+    resolution: {integrity: sha512-8CVjaLJGuSKMVTxJ2DpBl5XnlNDiT4cQFeuCJJrvJmts9YrTZDizTX7PjC2s6W4x+MBGZeEY6dGMrF04/6Hgqg==}
+    dev: true
+
+  /buffer@6.0.3:
+    resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==}
+    dependencies:
+      base64-js: 1.5.1
+      ieee754: 1.2.1
+    dev: true
+
+  /collection-utils@1.0.1:
+    resolution: {integrity: sha512-LA2YTIlR7biSpXkKYwwuzGjwL5rjWEZVOSnvdUc7gObvWe4WkjxOpfrdhoP7Hs09YWDVfg0Mal9BpAqLfVEzQg==}
+    dev: true
+
+  /cross-fetch@4.0.0:
+    resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==}
+    dependencies:
+      node-fetch: 2.7.0
+    transitivePeerDependencies:
+      - encoding
+    dev: true
+
+  /event-target-shim@5.0.1:
+    resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==}
+    engines: {node: '>=6'}
+    dev: true
+
+  /events@3.3.0:
+    resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
+    engines: {node: '>=0.8.x'}
+    dev: true
+
+  /ieee754@1.2.1:
+    resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
+    dev: true
+
+  /is-url@1.2.4:
+    resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==}
+    dev: true
+
+  /js-base64@3.7.5:
+    resolution: {integrity: sha512-3MEt5DTINKqfScXKfJFrRbxkrnk2AxPWGBL/ycjz4dK8iqiSJ06UxD8jh8xuh6p10TX4t2+7FsBYVxxQbMg+qA==}
+    dev: true
+
+  /lodash@4.17.21:
+    resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==}
+    dev: true
+
+  /node-fetch@2.7.0:
+    resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
+    engines: {node: 4.x || >=6.0.0}
+    peerDependencies:
+      encoding: ^0.1.0
+    peerDependenciesMeta:
+      encoding:
+        optional: true
+    dependencies:
+      whatwg-url: 5.0.0
+    dev: true
+
+  /pako@0.2.9:
+    resolution: {integrity: sha512-NUcwaKxUxWrZLpDG+z/xZaCgQITkA/Dv4V/T6bw7VON6l1Xz/VnrBqrYjZQ12TamKHzITTfOEIYUj48y2KXImA==}
+    dev: true
+
+  /pako@1.0.11:
+    resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
+    dev: true
+
+  /pluralize@8.0.0:
+    resolution: {integrity: sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==}
+    engines: {node: '>=4'}
+    dev: true
+
+  /process@0.11.10:
+    resolution: {integrity: sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==}
+    engines: {node: '>= 0.6.0'}
+    dev: true
+
+  /quicktype-core@23.0.81:
+    resolution: {integrity: sha512-iJQpCEzSQIkffJPS5NC+0w+Rq9faGgz09L+WIbseu1toFfj+M/3KTG5jhzdY/uN88fWosAom2fMoEADA403+rQ==}
+    dependencies:
+      '@glideapps/ts-necessities': 2.1.3
+      '@types/urijs': 1.19.25
+      browser-or-node: 2.1.1
+      collection-utils: 1.0.1
+      cross-fetch: 4.0.0
+      is-url: 1.2.4
+      js-base64: 3.7.5
+      lodash: 4.17.21
+      pako: 1.0.11
+      pluralize: 8.0.0
+      readable-stream: 4.4.2
+      unicode-properties: 1.4.1
+      urijs: 1.19.11
+      wordwrap: 1.0.0
+      yaml: 2.3.4
+    transitivePeerDependencies:
+      - encoding
+    dev: true
+
+  /readable-stream@4.4.2:
+    resolution: {integrity: sha512-Lk/fICSyIhodxy1IDK2HazkeGjSmezAWX2egdtJnYhtzKEsBPJowlI6F6LPb5tqIQILrMbx22S5o3GuJavPusA==}
+    engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
+    dependencies:
+      abort-controller: 3.0.0
+      buffer: 6.0.3
+      events: 3.3.0
+      process: 0.11.10
+      string_decoder: 1.3.0
+    dev: true
+
+  /safe-buffer@5.2.1:
+    resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
+    dev: true
+
+  /string_decoder@1.3.0:
+    resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
+    dependencies:
+      safe-buffer: 5.2.1
+    dev: true
+
+  /tiny-inflate@1.0.3:
+    resolution: {integrity: sha512-pkY1fj1cKHb2seWDy0B16HeWyczlJA9/WW3u3c4z/NiWDsO3DOU5D7nhTLE9CF0yXv/QZFY7sEJmj24dK+Rrqw==}
+    dev: true
+
+  /tr46@0.0.3:
+    resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==}
+    dev: true
+
+  /undici-types@5.26.5:
+    resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
+    dev: true
+
+  /unicode-properties@1.4.1:
+    resolution: {integrity: sha512-CLjCCLQ6UuMxWnbIylkisbRj31qxHPAurvena/0iwSVbQ2G1VY5/HjV0IRabOEbDHlzZlRdCrD4NhB0JtU40Pg==}
+    dependencies:
+      base64-js: 1.5.1
+      unicode-trie: 2.0.0
+    dev: true
+
+  /unicode-trie@2.0.0:
+    resolution: {integrity: sha512-x7bc76x0bm4prf1VLg79uhAzKw8DVboClSN5VxJuQ+LKDOVEW9CdH+VY7SP+vX7xCYQqzzgQpFqz15zeLvAtZQ==}
+    dependencies:
+      pako: 0.2.9
+      tiny-inflate: 1.0.3
+    dev: true
+
+  /urijs@1.19.11:
+    resolution: {integrity: sha512-HXgFDgDommxn5/bIv0cnQZsPhHDA90NPHD6+c/v21U5+Sx5hoP8+dP9IZXBU1gIfvdRfhG8cel9QNPeionfcCQ==}
+    dev: true
+
+  /webidl-conversions@3.0.1:
+    resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==}
+    dev: true
+
+  /whatwg-url@5.0.0:
+    resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==}
+    dependencies:
+      tr46: 0.0.3
+      webidl-conversions: 3.0.1
+    dev: true
+
+  /wordwrap@1.0.0:
+    resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==}
+    dev: true
+
+  /yaml@2.3.4:
+    resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==}
+    engines: {node: '>= 14'}
+    dev: true
diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
new file mode 100644
index 000000000..5aef9fcd3
--- /dev/null
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -0,0 +1,103 @@
+import type {
+    SerializedRenderResult
+} from "quicktype-core";
+import {
+    quicktype,
+    InputData,
+    JSONSchemaInput,
+    FetchingJSONSchemaStore
+
+} from "quicktype-core";
+import * as fs from "fs/promises";
+import { existsSync as pathExists } from "fs";
+import * as path from "path";
+
+const TYPESCRIPT_HEADER_FILE = `
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on ${new Date().toISOString()}
+ */
+
+`
+
+
+const rootDirFinder = function (): string {
+    const parts = __dirname.split("/");
+    let level = parts.length - 1;
+    while (level > 0) {
+        const currentPath = parts.slice(0, level).join("/");
+        console.debug(currentPath);
+        try {
+            require(`${currentPath}/package.json`);
+            return path.normalize(currentPath);
+        } catch (err) {
+            /// noop
+        }
+        level--;
+    }
+    return "";
+};
+
+async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
+    const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+    await schema.addSource({ name: `${taskId}-input`, schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }) });
+    await schema.addSource({ name: `${taskId}-output`, schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }) });
+    const inputData = new InputData();
+    inputData.addInput(schema);
+    return inputData;
+}
+
+
+async function generateTypescript(inputData: InputData): Promise<SerializedRenderResult> {
+    return await quicktype({
+        inputData,
+        lang: "typescript",
+        alphabetizeProperties: true,
+        rendererOptions: {
+            "just-types": true,
+            "nice-property-names": true,
+            "prefer-unions": true,
+            "prefer-const-values": true,
+        }
+    });
+}
+
+async function main() {
+    const rootDir = rootDirFinder();
+    const tasksDir = path.join(rootDir, "src", "tasks")
+    const allTasks = await Promise.all(
+        (await fs.readdir(tasksDir, { withFileTypes: true }))
+            .filter(entry => entry.isDirectory())
+            .map(async entry => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
+    );
+
+    for (const { task, dirPath } of allTasks) {
+        const taskSpecDir = path.join(dirPath, "spec")
+        if (!pathExists(taskSpecDir)) {
+            console.debug(`No spec found for task ${task} - skipping`);
+            continue
+        }
+        console.debug(`✨ Generating types for task`, task)
+
+        console.debug("   📦 Building input data")
+        const inputData = await buildInputData(task, taskSpecDir);
+
+        console.debug("   🏭 Generating typescript code")
+        {
+            const { lines } = await generateTypescript(inputData);
+            await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), { flag: "w+", encoding: "utf-8" });
+        }
+
+
+    }
+    console.debug("✅ All done!")
+}
+
+let exit = 0;
+main()
+    .catch(err => {
+        console.error("Failure", err);
+        exit = 1;
+    })
+    .finally(() => process.exit(exit));
\ No newline at end of file

From 5551f5b98263f1f0457095fbb9b7606fc6317f64 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:00:33 +0100
Subject: [PATCH 04/51] Add generated code

---
 .../tasks/audio-classification/inference.ts   | 50 +++++++++++
 .../src/tasks/text-generation/inference.ts    | 89 +++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 packages/tasks/src/tasks/audio-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-generation/inference.ts

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
new file mode 100644
index 000000000..ea5e9076e
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -0,0 +1,50 @@
+
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on 2024-01-19T14:59:10.562Z
+ */
+
+
+/**
+ * Inputs for Audio Classification inference
+ */
+export interface AudioClassificationInput {
+    /**
+     * On or several audio files to classify
+     */
+    inputs: any;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Audio Classification
+ */
+export interface AudioClassificationParameters {
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: any;
+}
+
+/**
+ * Outputs for Audio Classification inference
+ */
+export interface AudioClassificationOutput {
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
new file mode 100644
index 000000000..ef9661482
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -0,0 +1,89 @@
+
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on 2024-01-19T14:59:10.562Z
+ */
+
+
+/**
+ * Inputs for Text Generation inference
+ */
+export interface TextGenerationInput {
+    /**
+     * The text to initialize generation with
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text Generation
+ */
+export interface TextGenerationParameters {
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: any;
+}
+
+/**
+ * Outputs for Text Generation inference
+ */
+export interface TextGenerationOutput {
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: any;
+}

From fad594b648e1b61fadcb1a0f97aa0028c2697b90 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:04:07 +0100
Subject: [PATCH 05/51] =?UTF-8?q?=F0=9F=92=84=20Format=20with=20pnpm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |   2 +-
 .../tasks/src/scripts/inference-codegen.ts    | 145 ++++++++--------
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../audio-classification/spec/input.json      |  56 +++---
 .../audio-classification/spec/output.json     |  43 +++--
 .../src/tasks/text-generation/inference.ts    | 128 +++++++-------
 .../src/tasks/text-generation/spec/input.json | 164 +++++++++---------
 .../tasks/text-generation/spec/output.json    |  32 ++--
 8 files changed, 302 insertions(+), 318 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index d538c6487..258679aba 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -45,4 +45,4 @@
 		"@types/node": "^20.11.5",
 		"quicktype-core": "^23.0.81"
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 5aef9fcd3..6edc31fea 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -1,13 +1,5 @@
-import type {
-    SerializedRenderResult
-} from "quicktype-core";
-import {
-    quicktype,
-    InputData,
-    JSONSchemaInput,
-    FetchingJSONSchemaStore
-
-} from "quicktype-core";
+import type { SerializedRenderResult } from "quicktype-core";
+import { quicktype, InputData, JSONSchemaInput, FetchingJSONSchemaStore } from "quicktype-core";
 import * as fs from "fs/promises";
 import { existsSync as pathExists } from "fs";
 import * as path from "path";
@@ -19,85 +11,90 @@ const TYPESCRIPT_HEADER_FILE = `
  * Generated on ${new Date().toISOString()}
  */
 
-`
-
+`;
 
 const rootDirFinder = function (): string {
-    const parts = __dirname.split("/");
-    let level = parts.length - 1;
-    while (level > 0) {
-        const currentPath = parts.slice(0, level).join("/");
-        console.debug(currentPath);
-        try {
-            require(`${currentPath}/package.json`);
-            return path.normalize(currentPath);
-        } catch (err) {
-            /// noop
-        }
-        level--;
-    }
-    return "";
+	const parts = __dirname.split("/");
+	let level = parts.length - 1;
+	while (level > 0) {
+		const currentPath = parts.slice(0, level).join("/");
+		console.debug(currentPath);
+		try {
+			require(`${currentPath}/package.json`);
+			return path.normalize(currentPath);
+		} catch (err) {
+			/// noop
+		}
+		level--;
+	}
+	return "";
 };
 
 async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
-    const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
-    await schema.addSource({ name: `${taskId}-input`, schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }) });
-    await schema.addSource({ name: `${taskId}-output`, schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }) });
-    const inputData = new InputData();
-    inputData.addInput(schema);
-    return inputData;
+	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+	await schema.addSource({
+		name: `${taskId}-input`,
+		schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }),
+	});
+	await schema.addSource({
+		name: `${taskId}-output`,
+		schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }),
+	});
+	const inputData = new InputData();
+	inputData.addInput(schema);
+	return inputData;
 }
 
-
 async function generateTypescript(inputData: InputData): Promise<SerializedRenderResult> {
-    return await quicktype({
-        inputData,
-        lang: "typescript",
-        alphabetizeProperties: true,
-        rendererOptions: {
-            "just-types": true,
-            "nice-property-names": true,
-            "prefer-unions": true,
-            "prefer-const-values": true,
-        }
-    });
+	return await quicktype({
+		inputData,
+		lang: "typescript",
+		alphabetizeProperties: true,
+		rendererOptions: {
+			"just-types": true,
+			"nice-property-names": true,
+			"prefer-unions": true,
+			"prefer-const-values": true,
+		},
+	});
 }
 
 async function main() {
-    const rootDir = rootDirFinder();
-    const tasksDir = path.join(rootDir, "src", "tasks")
-    const allTasks = await Promise.all(
-        (await fs.readdir(tasksDir, { withFileTypes: true }))
-            .filter(entry => entry.isDirectory())
-            .map(async entry => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
-    );
-
-    for (const { task, dirPath } of allTasks) {
-        const taskSpecDir = path.join(dirPath, "spec")
-        if (!pathExists(taskSpecDir)) {
-            console.debug(`No spec found for task ${task} - skipping`);
-            continue
-        }
-        console.debug(`✨ Generating types for task`, task)
-
-        console.debug("   📦 Building input data")
-        const inputData = await buildInputData(task, taskSpecDir);
+	const rootDir = rootDirFinder();
+	const tasksDir = path.join(rootDir, "src", "tasks");
+	const allTasks = await Promise.all(
+		(await fs.readdir(tasksDir, { withFileTypes: true }))
+			.filter((entry) => entry.isDirectory())
+			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
+	);
 
-        console.debug("   🏭 Generating typescript code")
-        {
-            const { lines } = await generateTypescript(inputData);
-            await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), { flag: "w+", encoding: "utf-8" });
-        }
+	for (const { task, dirPath } of allTasks) {
+		const taskSpecDir = path.join(dirPath, "spec");
+		if (!pathExists(taskSpecDir)) {
+			console.debug(`No spec found for task ${task} - skipping`);
+			continue;
+		}
+		console.debug(`✨ Generating types for task`, task);
 
+		console.debug("   📦 Building input data");
+		const inputData = await buildInputData(task, taskSpecDir);
 
-    }
-    console.debug("✅ All done!")
+		console.debug("   🏭 Generating typescript code");
+		{
+			const { lines } = await generateTypescript(inputData);
+			await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), {
+				flag: "w+",
+				encoding: "utf-8",
+			});
+		}
+	}
+	console.debug("✅ All done!");
 }
 
 let exit = 0;
 main()
-    .catch(err => {
-        console.error("Failure", err);
-        exit = 1;
-    })
-    .finally(() => process.exit(exit));
\ No newline at end of file
+	.catch((err) => {
+		console.error("Failure", err);
+		exit = 1;
+	})
+	.finally(() => process.exit(exit));
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index ea5e9076e..aa0e4e86c 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Generated on 2024-01-19T14:59:10.562Z
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * On or several audio files to classify
-     */
-    inputs: any;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: any;
+	/**
+	 * On or several audio files to classify
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: any;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: any;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: any;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index b0b8757b1..f2f3fbfbf 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -1,30 +1,28 @@
 {
-    "id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Audio Classification inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "On or several audio files to classify"
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/definitions/AudioClassificationParameters"
-        }
-    },
-    "definitions": {
-        "AudioClassificationParameters": {
-            "description": "Additional inference parameters for Audio Classification",
-            "type": "object",
-            "properties": {
-                "topK": {
-                    "type": "integer",
-                    "description": "When specified, limits the output to the top K most probable classes."
-                }
-            }
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Audio Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several audio files to classify"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/AudioClassificationParameters"
+		}
+	},
+	"definitions": {
+		"AudioClassificationParameters": {
+			"description": "Additional inference parameters for Audio Classification",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index dfe0f0c5e..4985554db 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,24 +1,21 @@
 {
-    "id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "title": "AudioClassificationOutput",
-    "description": "Outputs for Audio Classification inference",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "label": {
-                "type": "string",
-                "description": "The predicted class label (model specific)."
-            },
-            "score": {
-                "type": "number",
-                "description": "The corresponding probability."
-            }
-        },
-        "required": [
-            "label",
-            "score"
-        ]
-    }
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "AudioClassificationOutput",
+	"description": "Outputs for Audio Classification inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index ef9661482..2db6493ba 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Generated on 2024-01-19T14:59:10.562Z
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: any;
+	/**
+	 * The text to initialize generation with
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: any;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: any;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stop_sequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `top_p` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: any;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: any;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index e0e73dd68..08f038702 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -1,84 +1,82 @@
 {
-    "id": "http://huggingface.co/inference/schemas/text-generation/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Text Generation inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "The text to initialize generation with",
-            "anyOf": [
-                {
-                    "type": "string"
-                },
-                {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            ]
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/definitions/TextGenerationParameters"
-        }
-    },
-    "definitions": {
-        "TextGenerationParameters": {
-            "description": "Additional inference parameters for Text Generation",
-            "type": "object",
-            "properties": {
-                "doSample": {
-                    "type": "boolean",
-                    "description": "Whether to use logit sampling (true) or greedy search (false)."
-                },
-                "maxNewTokens": {
-                    "type": "integer",
-                    "description": "Maximum number of generated tokens."
-                },
-                "repetitionPenalty": {
-                    "type": "number",
-                    "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
-                },
-                "returnFullText": {
-                    "type": "boolean",
-                    "description": "Whether to prepend the prompt to the generated text."
-                },
-                "stopSequences": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    },
-                    "description": "Stop generating tokens if a member of `stop_sequences` is generated."
-                },
-                "temperature": {
-                    "type": "number",
-                    "description": "The value used to modulate the logits distribution."
-                },
-                "topK": {
-                    "type": "integer",
-                    "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
-                },
-                "topP": {
-                    "type": "number",
-                    "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
-                },
-                "truncate": {
-                    "type": "integer",
-                    "description": "Truncate input tokens to the given size."
-                },
-                "typicalP": {
-                    "type": "number",
-                    "description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
-                },
-                "watermark": {
-                    "type": "boolean",
-                    "description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
-                }
-            }
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text Generation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The text to initialize generation with",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextGenerationParameters"
+		}
+	},
+	"definitions": {
+		"TextGenerationParameters": {
+			"description": "Additional inference parameters for Text Generation",
+			"type": "object",
+			"properties": {
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use logit sampling (true) or greedy search (false)."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "Maximum number of generated tokens."
+				},
+				"repetitionPenalty": {
+					"type": "number",
+					"description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
+				},
+				"returnFullText": {
+					"type": "boolean",
+					"description": "Whether to prepend the prompt to the generated text."
+				},
+				"stopSequences": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "Stop generating tokens if a member of `stop_sequences` is generated."
+				},
+				"temperature": {
+					"type": "number",
+					"description": "The value used to modulate the logits distribution."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
+				},
+				"truncate": {
+					"type": "integer",
+					"description": "Truncate input tokens to the given size."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
+				},
+				"watermark": {
+					"type": "boolean",
+					"description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index 163653942..ccbeaea20 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -1,18 +1,16 @@
 {
-    "id": "http://huggingface.co/inference/schemas/text-generation/output.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Outputs for Text Generation inference",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "generatedText": {
-                "type": "string",
-                "description": "The generated text"
-            }
-        },
-        "required": [
-            "generatedText"
-        ]
-    }
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Text Generation inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text"
+			}
+		},
+		"required": ["generatedText"]
+	}
+}

From 9a8f327840784d39c91dc6301b07c1a0c2e9f3ed Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:54:41 +0100
Subject: [PATCH 06/51] misc fix

---
 packages/tasks/src/tasks/audio-classification/spec/output.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index 4985554db..ddacf5872 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+	"id": "http://huggingface.co/inference/schemas/audio-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"title": "AudioClassificationOutput",
 	"description": "Outputs for Audio Classification inference",

From 02ba10c04add23abda5c37a9e8d8f95e8662d25e Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:13:08 +0100
Subject: [PATCH 07/51] =?UTF-8?q?=E2=9C=A8=20Add=20specs=20for=20existing?=
 =?UTF-8?q?=20tasks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../spec/input.json                           | 23 +++++
 .../spec/output.json                          | 16 ++++
 .../tasks/depth-estimation/spec/input.json    | 28 ++++++
 .../tasks/depth-estimation/spec/output.json   |  9 ++
 .../spec/input.json                           | 94 +++++++++++++++++++
 .../spec/output.json                          | 35 +++++++
 .../tasks/feature-extraction/spec/input.json  | 34 +++++++
 .../tasks/feature-extraction/spec/output.json | 58 ++++++++++++
 .../tasks/src/tasks/fill-mask/spec/input.json | 53 +++++++++++
 .../src/tasks/fill-mask/spec/output.json      | 28 ++++++
 .../image-classification/spec/input.json      | 28 ++++++
 .../image-classification/spec/output.json     | 20 ++++
 .../tasks/image-segmentation/spec/input.json  | 51 ++++++++++
 .../tasks/image-segmentation/spec/output.json | 20 ++++
 .../src/tasks/image-to-image/spec/input.json  | 21 +++++
 .../src/tasks/image-to-image/spec/output.json |  9 ++
 .../src/tasks/image-to-text/spec/input.json   | 28 ++++++
 .../src/tasks/image-to-text/spec/output.json  | 16 ++++
 .../tasks/object-detection/spec/input.json    | 28 ++++++
 .../tasks/object-detection/spec/output.json   | 44 +++++++++
 .../tasks/question-answering/spec/input.json  | 77 +++++++++++++++
 .../tasks/question-answering/spec/output.json | 28 ++++++
 .../table-question-answering/spec/input.json  | 44 +++++++++
 .../table-question-answering/spec/output.json | 39 ++++++++
 .../tasks/text-classification/spec/input.json | 54 +++++++++++
 .../text-classification/spec/output.json      | 20 ++++
 .../src/tasks/text-to-speech/spec/input.json  | 34 +++++++
 .../src/tasks/text-to-speech/spec/output.json | 19 ++++
 .../token-classification/spec/input.json      | 72 ++++++++++++++
 .../token-classification/spec/output.json     | 32 +++++++
 .../video-classification/spec/input.json      | 36 +++++++
 .../video-classification/spec/output.json     | 20 ++++
 .../visual-question-answering/spec/input.json | 51 ++++++++++
 .../spec/output.json                          | 20 ++++
 .../zero-shot-classification/spec/input.json  | 60 ++++++++++++
 .../zero-shot-classification/spec/output.json | 20 ++++
 .../spec/input.json                           | 55 +++++++++++
 .../spec/output.json                          | 20 ++++
 .../spec/input.json                           | 50 ++++++++++
 .../spec/output.json                          | 44 +++++++++
 40 files changed, 1438 insertions(+)
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
 create mode 100644 packages/tasks/src/tasks/depth-estimation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/depth-estimation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/document-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/document-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/feature-extraction/spec/input.json
 create mode 100644 packages/tasks/src/tasks/feature-extraction/spec/output.json
 create mode 100644 packages/tasks/src/tasks/fill-mask/spec/input.json
 create mode 100644 packages/tasks/src/tasks/fill-mask/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-segmentation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-segmentation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-to-image/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-to-image/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-to-text/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-to-text/spec/output.json
 create mode 100644 packages/tasks/src/tasks/object-detection/spec/input.json
 create mode 100644 packages/tasks/src/tasks/object-detection/spec/output.json
 create mode 100644 packages/tasks/src/tasks/question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/table-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/table-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text-to-speech/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-speech/spec/output.json
 create mode 100644 packages/tasks/src/tasks/token-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/token-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/video-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/video-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
new file mode 100644
index 000000000..dfd1c4bdb
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -0,0 +1,23 @@
+{
+	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Automatic Speech Recognition inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input audio data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/AutomaticSpeechRecognitionParameters"
+		}
+	},
+	"definitions": {
+		"AutomaticSpeechRecognitionParameters": {
+			"description": "Additional inference parameters for Automatic Speech Recognition",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
new file mode 100644
index 000000000..e11153af6
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Automatic Speech Recognition task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"text": {
+				"type": "string",
+				"description": "The recognized text."
+			}
+		},
+		"required": ["text"]
+	}
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
new file mode 100644
index 000000000..8483f13b5
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/depth-estimation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Depth Estimation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/DepthEstimationParameters"
+		}
+	},
+	"definitions": {
+		"DepthEstimationParameters": {
+			"description": "Additional inference parameters for Depth Estimation",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
new file mode 100644
index 000000000..643aaaa7b
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -0,0 +1,9 @@
+{
+	"id": "http://huggingface.co/inference/schemas/depth-estimation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Depth Estimation task",
+	"type": "array",
+	"items": {
+		"description": "The output depth labels"
+	}
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
new file mode 100644
index 000000000..dc72a24b2
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -0,0 +1,94 @@
+{
+	"id": "http://huggingface.co/inference/schemas/document-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Document Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several document+question pairs to answer",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/DocumentAndQuestion"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/DocumentAndQuestion"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/DocumentQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"DocumentQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Document Question Answering",
+			"type": "object",
+			"properties": {
+				"docStride": {
+					"type": "integer",
+					"description": "If the words in the document are too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
+				},
+				"handleImpossibleAnswer": {
+					"type": "boolean",
+					"description": "Whether to accept impossible as an answer"
+				},
+				"lang": {
+					"type": "string",
+					"description": "Language to use while running OCR. Defaults to English."
+				},
+				"maxAnswerLen": {
+					"type": "integer",
+					"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
+				},
+				"maxSeqLen": {
+					"type": "integer",
+					"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using doc_stride as overlap) if needed."
+				},
+				"maxQuestionLen": {
+					"type": "integer",
+					"description": "The maximum length of the question after tokenization. It will be truncated if needed."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Can return less than top_k answers if there are not enough options available within the context."
+				},
+				"wordBoxes": {
+					"type": "array",
+					"description": "A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR step and use the provided bounding boxes instead.",
+					"items": {
+						"anyOf": [
+							{
+								"type": "string"
+							},
+							{
+								"type": "array",
+								"items": {
+									"type": "number"
+								},
+								"maxItems": 4,
+								"minItems": 4
+							}
+						]
+					}
+				}
+			}
+		},
+		"DocumentAndQuestion": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image on which the question is asked"
+				},
+				"question": {
+					"type": "string",
+					"description": "A question to ask of the document"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
new file mode 100644
index 000000000..60f6b5314
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -0,0 +1,35 @@
+{
+	"id": "http://huggingface.co/inference/schemas/document-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Document Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question."
+			},
+			"score": {
+				"type": "number",
+				"description": "The probability associated to the answer."
+			},
+			"start": {
+				"type": "integer",
+				"description": "The start word index of the answer (in the OCR’d version of the input or provided word boxes)."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The end word index of the answer (in the OCR’d version of the input or provided word boxes)."
+			},
+			"words": {
+				"type": "array",
+				"items": {
+					"type": "integer"
+				},
+				"description": "The index of each word/box pair that is in the answer"
+			}
+		},
+		"required": ["answer", "score", "start", "end", "words"]
+	}
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
new file mode 100644
index 000000000..afa1ec998
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -0,0 +1,34 @@
+{
+	"id": "http://huggingface.co/inference/schemas/feature-extraction/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Feature Extraction inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to get the features of",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/FeatureExtractionParameters"
+		}
+	},
+	"definitions": {
+		"FeatureExtractionParameters": {
+			"description": "Additional inference parameters for Feature Extraction",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
new file mode 100644
index 000000000..f2e0ce2bf
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -0,0 +1,58 @@
+{
+	"id": "http://huggingface.co/inference/schemas/feature-extraction/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Feature Extraction task",
+	"type": "array",
+	"items": {
+		"description": "The features computed by the model, as a nested list of floats",
+		"$ref": "#/definitions/FeatureDimension"
+	},
+	"definitions": {
+		"FeatureDimension": {
+			"type": "array",
+			"items": {
+				"anyOf": [
+					{
+						"type": "number"
+					},
+					{
+						"type": "array",
+						"items": {
+							"anyOf": [
+								{
+									"type": "number"
+								},
+								{
+									"type": "array",
+									"items": {
+										"anyOf": [
+											{
+												"type": "number"
+											},
+											{
+												"type": "array",
+												"items": {
+													"anyOf": [
+														{
+															"type": "number"
+														},
+														{
+															"type": "array",
+															"items": {
+																"type": "number"
+															}
+														}
+													]
+												}
+											}
+										]
+									}
+								}
+							]
+						}
+					}
+				]
+			}
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
new file mode 100644
index 000000000..b0588e21a
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -0,0 +1,53 @@
+{
+	"id": "http://huggingface.co/inference/schemas/fill-mask/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Fill Mask inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts with masked tokens",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/FillMaskParameters"
+		}
+	},
+	"definitions": {
+		"FillMaskParameters": {
+			"description": "Additional inference parameters for Fill Mask",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When passed, overrides the number of predictions to return."
+				},
+				"targets": {
+					"anyOf": [
+						{
+							"type": "string"
+						},
+						{
+							"type": "array",
+							"items": {
+								"type": "string"
+							}
+						}
+					],
+					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower)."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
new file mode 100644
index 000000000..9ecf5aff8
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/fill-mask/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Fill Mask task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"sequence": {
+				"type": "string",
+				"description": "The corresponding input with the mask token prediction."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability"
+			},
+			"token": {
+				"type": "integer",
+				"description": "The predicted token id (to replace the masked one)."
+			},
+			"tokenStr": {
+				"type": "string",
+				"description": "The predicted token (to replace the masked one)."
+			}
+		},
+		"required": ["sequence", "score", "token", "tokenStr"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
new file mode 100644
index 000000000..a9d09224b
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several image files to classify"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ImageClassificationParameters": {
+			"description": "Additional inference parameters for Image Classification",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
new file mode 100644
index 000000000..f48dc3e77
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
new file mode 100644
index 000000000..4063d6619
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -0,0 +1,51 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-segmentation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image Segmentation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several image files to perform segmentation on"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageSegmentationParameters"
+		}
+	},
+	"definitions": {
+		"ImageSegmentationParameters": {
+			"description": "Additional inference parameters for Image Segmentation",
+			"type": "object",
+			"properties": {
+				"maskThreshold": {
+					"type": "number",
+					"description": "Threshold to use when turning the predicted masks into binary values."
+				},
+				"overlapMaskAreaThreshold": {
+					"type": "number",
+					"description": "Mask overlap threshold to eliminate small, disconnected segments."
+				},
+				"subtask": {
+					"type": "string",
+					"description": "Segmentation task to be performed, depending on model capabilities.",
+					"oneOf": [
+						{
+							"const": "instance"
+						},
+						{
+							"const": "panoptic"
+						},
+						{
+							"const": "semantic"
+						}
+					]
+				},
+				"threshold": {
+					"type": "number",
+					"description": "Probability threshold to filter out predicted masks."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
new file mode 100644
index 000000000..694abf493
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-segmentation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image Segmentation task",
+	"type": "array",
+	"items": {
+		"description": "A predicted mask / segment",
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The label of the predicted segment"
+			},
+			"mask": {
+				"description": "The corresponding mask as a black-and-white image"
+			}
+		},
+		"required": ["label", "mask"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
new file mode 100644
index 000000000..2d2978c3a
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -0,0 +1,21 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-image/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Image inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more images to generate images from"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageToImageParameters"
+		}
+	},
+	"definitions": {
+		"ImageToImageParameters": {
+			"description": "Additional inference parameters for Image To Image"
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
new file mode 100644
index 000000000..0ec41e450
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -0,0 +1,9 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-image/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Image task",
+	"type": "array",
+	"items": {
+		"description": "The output image"
+	}
+}
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
new file mode 100644
index 000000000..405521847
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-text/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Text inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to generate text for"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageToTextParameters"
+		}
+	},
+	"definitions": {
+		"ImageToTextParameters": {
+			"description": "Additional inference parameters for Image To Text",
+			"type": "object",
+			"properties": {
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
new file mode 100644
index 000000000..0c0392b50
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-text/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Text task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text."
+			}
+		},
+		"required": ["generatedText"]
+	}
+}
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
new file mode 100644
index 000000000..7698570f6
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/object-detection/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Object Detection inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several input images to perform object detection on"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ObjectDetectionParameters"
+		}
+	},
+	"definitions": {
+		"ObjectDetectionParameters": {
+			"description": "Additional inference parameters for Object Detection",
+			"type": "object",
+			"properties": {
+				"threshold": {
+					"type": "number",
+					"description": "The probability necessary to make a prediction."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
new file mode 100644
index 000000000..ef46a2265
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/object-detection/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Object Detection task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted label for the bounding box"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"box": {
+				"$ref": "#/definitions/BoundingBox",
+				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+			}
+		},
+		"required": ["box", "label", "score"]
+	},
+	"definitions": {
+		"BoundingBox": {
+			"type": "object",
+			"properties": {
+				"xmin": {
+					"type": "integer"
+				},
+				"xmax": {
+					"type": "integer"
+				},
+				"ymin": {
+					"type": "integer"
+				},
+				"ymax": {
+					"type": "integer"
+				}
+			},
+			"required": ["xmin", "xmax", "ymin", "ymax"]
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
new file mode 100644
index 000000000..9f1737f03
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -0,0 +1,77 @@
+{
+	"id": "http://huggingface.co/inference/schemas/question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several question+context pairs to answer",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/QuestionAnsweringInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/QuestionAnsweringInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/QuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"QuestionAnsweringInput": {
+			"type": "object",
+			"properties": {
+				"question": {
+					"type": "string",
+					"description": "The question to be answered"
+				},
+				"context": {
+					"type": "string",
+					"description": "The context to be used for answering the question"
+				}
+			},
+			"required": ["question", "context"]
+		},
+		"QuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Question Answering",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topK answers if there are not enough options available within the context."
+				},
+				"docStride": {
+					"type": "integer",
+					"description": "If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
+				},
+				"maxAnswerLen": {
+					"type": "integer",
+					"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
+				},
+				"maxSeqLen": {
+					"type": "integer",
+					"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed."
+				},
+				"maxQuestionLen": {
+					"type": "integer",
+					"description": "The maximum length of the question after tokenization. It will be truncated if needed."
+				},
+				"handleImpossibleAnswer": {
+					"type": "boolean",
+					"description": "Whether to accept impossible as an answer."
+				},
+				"alignToWords": {
+					"type": "boolean",
+					"description": "Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese)"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/question-answering/spec/output.json b/packages/tasks/src/tasks/question-answering/spec/output.json
new file mode 100644
index 000000000..eea7e8e51
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/spec/output.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question."
+			},
+			"score": {
+				"type": "number",
+				"description": "The probability associated to the answer."
+			},
+			"start": {
+				"type": "integer",
+				"description": "The character position in the input where the answer begins."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The character position in the input where the answer ends."
+			}
+		},
+		"required": ["answer", "score", "start", "end"]
+	}
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
new file mode 100644
index 000000000..8d35bcf71
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/table-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Table Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several questions about a table",
+			"type": "object",
+			"properties": {
+				"table": {
+					"description": "The table to serve as context for the questions",
+					"type": "object"
+				},
+				"question": {
+					"description": "One or several questions to be answered about the table",
+					"anyOf": [
+						{
+							"type": "string"
+						},
+						{
+							"type": "array",
+							"items": {
+								"type": "string"
+							}
+						}
+					]
+				}
+			}
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TableQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"TableQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Table Question Answering",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
new file mode 100644
index 000000000..bb7969e3b
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -0,0 +1,39 @@
+{
+	"id": "http://huggingface.co/inference/schemas/table-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Table Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer of the question given the table. If there is an aggregator, the answer will be preceded by `AGGREGATOR >`."
+			},
+			"coordinates": {
+				"type": "array",
+				"description": "Coordinates of the cells of the answers.",
+				"items": {
+					"type": "array",
+					"items": {
+						"type": "integer"
+					},
+					"minItems": 2,
+					"maxItems": 2
+				}
+			},
+			"cells": {
+				"type": "array",
+				"description": "List of strings made up of the answer cell values.",
+				"items": {
+					"type": "string"
+				}
+			},
+			"aggregator": {
+				"type": "string",
+				"description": "If the model has an aggregator, this returns the aggregator."
+			}
+		},
+		"required": ["answer", "cells", "coordinates"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
new file mode 100644
index 000000000..b3b44d2ab
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -0,0 +1,54 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to classify",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextClassificationParameters"
+		}
+	},
+	"definitions": {
+		"TextClassificationParameters": {
+			"description": "Additional inference parameters for Text Classification",
+			"type": "object",
+			"properties": {
+				"functionToApply": {
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
new file mode 100644
index 000000000..1c317ed02
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
new file mode 100644
index 000000000..c0e94850e
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -0,0 +1,34 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-to-audio/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Audio inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to generate audio for",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextToAudioParameters"
+		}
+	},
+	"definitions": {
+		"TextToAudioParameters": {
+			"description": "Additional inference parameters for Text To Audio",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
new file mode 100644
index 000000000..e8e92bbf8
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -0,0 +1,19 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-to-audio/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Audio task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"audio": {
+				"description": "The generated audio waveform."
+			},
+			"samplingRate": {
+				"type": "number",
+				"description": "The sampling rate of the generated audio waveform."
+			}
+		},
+		"required": ["audio", "samplingRate"]
+	}
+}
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
new file mode 100644
index 000000000..d70c71a81
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -0,0 +1,72 @@
+{
+	"id": "http://huggingface.co/inference/schemas/token-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Token Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts whose tokens are to be classified",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TokenClassificationParameters"
+		}
+	},
+	"definitions": {
+		"TokenClassificationParameters": {
+			"description": "Additional inference parameters for Token Classification",
+			"type": "object",
+			"properties": {
+				"ignoreLabels": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "A list of labels to ignore"
+				},
+				"stride": {
+					"type": "integer",
+					"description": "The number of overlapping tokens between chunks when splitting the input text."
+				},
+				"aggregationStrategy": {
+					"type": "string",
+					"description": "The strategy used to fuse tokens based on model predictions",
+					"oneOf": [
+						{
+							"const": "none",
+							"description": "Do not aggregate tokens"
+						},
+						{
+							"const": "simple",
+							"description": "Group consecutive tokens with the same label in a single entity."
+						},
+						{
+							"const": "first",
+							"description": "Similar to \"simple\", also preserves word integrity (use the label predicted for the first token in a word)."
+						},
+						{
+							"const": "average",
+							"description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score, averaged across the word's tokens)."
+						},
+						{
+							"const": "max",
+							"description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score across the word's tokens)."
+						}
+					]
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
new file mode 100644
index 000000000..0e9a03768
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -0,0 +1,32 @@
+{
+	"id": "http://huggingface.co/inference/schemas/token-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Token Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"entityGroup": {
+				"type": "string",
+				"description": "The predicted label for that group of tokens"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"word": {
+				"type": "string",
+				"description": "The corresponding text"
+			},
+			"start": {
+				"type": "integer",
+				"description": "The character position in the input where this group begins."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The character position in the input where this group ends."
+			}
+		},
+		"required": ["entityGroup", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
new file mode 100644
index 000000000..9f58d0bf3
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -0,0 +1,36 @@
+{
+	"id": "http://huggingface.co/inference/schemas/video-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Video Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several videos to be classified"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/VideoClassificationParameters"
+		}
+	},
+	"definitions": {
+		"VideoClassificationParameters": {
+			"description": "Additional inference parameters for Video Classification",
+			"type": "object",
+			"properties": {
+				"numFrames": {
+					"type": "integer",
+					"description": "The number of sampled frames to consider for classification."
+				},
+				"frameSamplingRate": {
+					"type": "integer",
+					"description": "The sampling rate used to select frames from the video."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
new file mode 100644
index 000000000..aa4e369f1
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/video-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Video Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
new file mode 100644
index 000000000..134351b6b
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -0,0 +1,51 @@
+{
+	"id": "http://huggingface.co/inference/schemas/visual-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Visual Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more image-question pairs",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/VisualQuestionAnsweringInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/VisualQuestionAnsweringInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/VisualQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"VisualQuestionAnsweringInput": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image."
+				},
+				"question": {
+					"description": "The question to answer based on the image."
+				}
+			},
+			"required": ["question", "image"]
+		},
+		"VisualQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Visual Question Answering",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Note that fewer than topK answers are returned if there are not enough options available within the context."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
new file mode 100644
index 000000000..808957434
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/visual-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Visual Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["answer", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
new file mode 100644
index 000000000..3682a9ddf
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -0,0 +1,60 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several text + candidate labels pairs to classify",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotClassificationInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotClassificationInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotClassificationInput": {
+			"type": "object",
+			"properties": {
+				"text": {
+					"type": "string",
+					"description": "The text to classify"
+				},
+				"candidateLabels": {
+					"type": "array",
+					"description": "The set of possible class labels to classify the text into.",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["text", "candidateLabels"]
+		},
+		"ZeroShotClassificationParameters": {
+			"description": "Additional inference parameters for Zero Shot Classification",
+			"type": "object",
+			"properties": {
+				"hypothesisTemplate": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+				},
+				"multiLabel": {
+					"type": "boolean",
+					"description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
new file mode 100644
index 000000000..478e0bef2
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
new file mode 100644
index 000000000..4e60af72e
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -0,0 +1,55 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Image Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to classify",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotImageClassificationInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotImageClassificationInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotImageClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotImageClassificationInput": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image data to classify"
+				},
+				"candidateLabels": {
+					"description": "The candidate labels for this image",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["image", "candidateLabels"]
+		},
+		"ZeroShotImageClassificationParameters": {
+			"description": "Additional inference parameters for Zero Shot Image Classification",
+			"type": "object",
+			"properties": {
+				"hypothesisTemplate": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the image classification by replacing the placeholder with the candidate labels."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
new file mode 100644
index 000000000..a400d6622
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Image Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
new file mode 100644
index 000000000..93e95f25f
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -0,0 +1,50 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Object Detection inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to perform object detection on",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotObjectDetectionParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotObjectDetectionInputs": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image data to generate bounding boxes from"
+				},
+				"candidateLabels": {
+					"description": "The candidate labels for this image",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["image", "candidateLabels"]
+		},
+		"ZeroShotObjectDetectionParameters": {
+			"description": "Additional inference parameters for Zero Shot Object Detection",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
new file mode 100644
index 000000000..c5fd05eb3
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Object Detection task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"box": {
+				"$ref": "#/definitions/BoundingBox",
+				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+			}
+		},
+		"required": ["box", "label", "score"]
+	},
+	"definitions": {
+		"BoundingBox": {
+			"type": "object",
+			"properties": {
+				"xmin": {
+					"type": "integer"
+				},
+				"xmax": {
+					"type": "integer"
+				},
+				"ymin": {
+					"type": "integer"
+				},
+				"ymax": {
+					"type": "integer"
+				}
+			},
+			"required": ["xmin", "xmax", "ymin", "ymax"]
+		}
+	}
+}

From 93c37f5cd9c7c72012229e21dcb9aebbc5d34ddc Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:15:01 +0100
Subject: [PATCH 08/51] =?UTF-8?q?=F0=9F=A9=B9=20Ignore=20placeholder=20whe?=
 =?UTF-8?q?n=20generating=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 6edc31fea..0dcae447f 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -65,6 +65,7 @@ async function main() {
 	const allTasks = await Promise.all(
 		(await fs.readdir(tasksDir, { withFileTypes: true }))
 			.filter((entry) => entry.isDirectory())
+			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
 

From bbf72eca743a4d4ce3527d5d0f4b0a68a3e3e9b4 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:16:33 +0100
Subject: [PATCH 09/51] =?UTF-8?q?=F0=9F=A9=B9=20Fix:=20ensure=20spec=20fil?=
 =?UTF-8?q?es=20exist?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 0dcae447f..9ffe24bd4 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -71,7 +71,7 @@ async function main() {
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
-		if (!pathExists(taskSpecDir)) {
+		if (!(pathExists(path.join(taskSpecDir, "input.json")) && pathExists(path.join(taskSpecDir, "output.json")))) {
 			console.debug(`No spec found for task ${task} - skipping`);
 			continue;
 		}

From 16a9bebe0cbdbd82d59b2d6dbe1bcbf4177524e7 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:17:13 +0100
Subject: [PATCH 10/51] =?UTF-8?q?=E2=9C=A8=20Generate=20inference=20types?=
 =?UTF-8?q?=20for=20existing=20tasks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/audio-classification/inference.ts   |   2 +-
 .../automatic-speech-recognition/inference.ts |  31 ++++++
 .../src/tasks/depth-estimation/inference.ts   |  33 ++++++
 .../document-question-answering/inference.ts  | 101 ++++++++++++++++++
 .../src/tasks/feature-extraction/inference.ts |  20 ++++
 .../tasks/src/tasks/fill-mask/inference.ts    |  63 +++++++++++
 .../tasks/image-classification/inference.ts   |  48 +++++++++
 .../src/tasks/image-segmentation/inference.ts |  64 +++++++++++
 .../src/tasks/image-to-image/inference.ts     |  20 ++++
 .../src/tasks/image-to-text/inference.ts      |  44 ++++++++
 .../src/tasks/object-detection/inference.ts   |  65 +++++++++++
 .../src/tasks/question-answering/inference.ts |  99 +++++++++++++++++
 .../table-question-answering/inference.ts     |  59 ++++++++++
 .../tasks/text-classification/inference.ts    |  54 ++++++++++
 .../src/tasks/text-generation/inference.ts    |   2 +-
 .../src/tasks/text-to-speech/inference.ts     |  35 ++++++
 .../tasks/token-classification/inference.ts   |  85 +++++++++++++++
 .../tasks/video-classification/inference.ts   |  56 ++++++++++
 .../visual-question-answering/inference.ts    |  63 +++++++++++
 .../zero-shot-classification/inference.ts     |  67 ++++++++++++
 .../inference.ts                              |  61 +++++++++++
 .../zero-shot-object-detection/inference.ts   |  64 +++++++++++
 22 files changed, 1134 insertions(+), 2 deletions(-)
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
 create mode 100644 packages/tasks/src/tasks/depth-estimation/inference.ts
 create mode 100644 packages/tasks/src/tasks/document-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/feature-extraction/inference.ts
 create mode 100644 packages/tasks/src/tasks/fill-mask/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-segmentation/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-to-image/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-to-text/inference.ts
 create mode 100644 packages/tasks/src/tasks/object-detection/inference.ts
 create mode 100644 packages/tasks/src/tasks/question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/table-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-to-speech/inference.ts
 create mode 100644 packages/tasks/src/tasks/token-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/video-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/inference.ts

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index aa0e4e86c..33764ad33 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T14:59:10.562Z
+ * Generated on 2024-01-19T16:16:01.752Z
  */
 
 /**
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
new file mode 100644
index 000000000..84540aa70
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -0,0 +1,31 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Automatic Speech Recognition inference
+ */
+export interface AutomaticSpeechRecognitionInput {
+	/**
+	 * The input audio data
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Automatic Speech Recognition task
+ */
+export interface AutomaticSpeechRecognitionOutput {
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
new file mode 100644
index 000000000..0feea79b1
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -0,0 +1,33 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Depth Estimation inference
+ */
+export interface DepthEstimationInput {
+	/**
+	 * The input image data
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Depth Estimation
+ */
+export interface DepthEstimationParameters {
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
new file mode 100644
index 000000000..ac1f84865
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -0,0 +1,101 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Document Question Answering inference
+ */
+export interface DocumentQuestionAnsweringInput {
+	/**
+	 * One or several document+question pairs to answer
+	 */
+	inputs: DocumentAndQuestion[] | DocumentAndQuestion;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface DocumentAndQuestion {
+	/**
+	 * The image on which the question is asked
+	 */
+	image?: any;
+	/**
+	 * A question to ask of the document
+	 */
+	question?: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Document Question Answering
+ */
+export interface DocumentQuestionAnsweringParameters {
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Document Question Answering task
+ */
+export interface DocumentQuestionAnsweringOutput {
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
new file mode 100644
index 000000000..8a043fc20
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -0,0 +1,20 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Feature Extraction inference
+ */
+export interface FeatureExtractionInput {
+	/**
+	 * One or several texts to get the features of
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
new file mode 100644
index 000000000..6a49d856d
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -0,0 +1,63 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Fill Mask inference
+ */
+export interface FillMaskInput {
+	/**
+	 * One or several texts with masked tokens
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Fill Mask
+ */
+export interface FillMaskParameters {
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[] | string;
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Fill Mask task
+ */
+export interface FillMaskOutput {
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
new file mode 100644
index 000000000..0d5d438d6
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -0,0 +1,48 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image Classification inference
+ */
+export interface ImageClassificationInput {
+	/**
+	 * One or several image files to classify
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image Classification
+ */
+export interface ImageClassificationParameters {
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Image Classification task
+ */
+export interface ImageClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
new file mode 100644
index 000000000..88548bc2b
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -0,0 +1,64 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image Segmentation inference
+ */
+export interface ImageSegmentationInput {
+	/**
+	 * One or several image files to perform segmentation on
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image Segmentation
+ */
+export interface ImageSegmentationParameters {
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: Subtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: any;
+}
+
+export type Subtask = "instance" | "panoptic" | "semantic";
+
+/**
+ * Outputs of inference for the Image Segmentation task
+ *
+ * A predicted mask / segment
+ */
+export interface ImageSegmentationOutput {
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: any;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
new file mode 100644
index 000000000..6150a5eb8
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -0,0 +1,20 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image To Image inference
+ */
+export interface ImageToImageInput {
+	/**
+	 * One or more images to generate images from
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: any;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
new file mode 100644
index 000000000..66f520248
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -0,0 +1,44 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image To Text inference
+ */
+export interface ImageToTextInput {
+	/**
+	 * One or several images to generate text for
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Text
+ */
+export interface ImageToTextParameters {
+	/**
+	 * The maximum number of tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Image To Text task
+ */
+export interface ImageToTextOutput {
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
new file mode 100644
index 000000000..22edb2cce
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -0,0 +1,65 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Object Detection inference
+ */
+export interface ObjectDetectionInput {
+	/**
+	 * One or several input images to perform object detection on
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Object Detection
+ */
+export interface ObjectDetectionParameters {
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Object Detection task
+ */
+export interface ObjectDetectionOutput {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
+
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
new file mode 100644
index 000000000..829e1fd1b
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -0,0 +1,99 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Question Answering inference
+ */
+export interface QuestionAnsweringInput {
+	/**
+	 * One or several question+context pairs to answer
+	 */
+	inputs: QuestionAnsweringInputElement[] | QuestionAnsweringInputElement;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface QuestionAnsweringInputElement {
+	/**
+	 * The context to be used for answering the question
+	 */
+	context: string;
+	/**
+	 * The question to be answered
+	 */
+	question: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Question Answering
+ */
+export interface QuestionAnsweringParameters {
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return fewer than topK answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Question Answering task
+ */
+export interface QuestionAnsweringOutput {
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
new file mode 100644
index 000000000..8b7a3d227
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Table Question Answering inference
+ */
+export interface TableQuestionAnsweringInput {
+	/**
+	 * One or several questions about a table
+	 */
+	inputs: Inputs;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * One or several questions about a table
+ */
+export interface Inputs {
+	/**
+	 * One or several questions to be answered about the table
+	 */
+	question?: string[] | string;
+	/**
+	 * The table to serve as context for the questions
+	 */
+	table?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Table Question Answering task
+ */
+export interface TableQuestionAnsweringOutput {
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer to the question given the table. If there is an aggregator, the answer will be
+	 * preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers.
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
new file mode 100644
index 000000000..75a7032a2
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -0,0 +1,54 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Text Classification inference
+ */
+export interface TextClassificationInput {
+	/**
+	 * One or several texts to classify
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text Classification
+ */
+export interface TextClassificationParameters {
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: FunctionToApply;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+export type FunctionToApply = "sigmoid" | "softmax" | "none";
+
+/**
+ * Outputs of inference for the Text Classification task
+ */
+export interface TextClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 2db6493ba..86725e74e 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T14:59:10.562Z
+ * Generated on 2024-01-19T16:16:01.752Z
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
new file mode 100644
index 000000000..23ad75189
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -0,0 +1,35 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Text To Audio inference
+ */
+export interface TextToSpeechInput {
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Text To Audio task
+ */
+export interface TextToSpeechOutput {
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: any;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
new file mode 100644
index 000000000..3b7e5e0b3
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -0,0 +1,85 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Token Classification inference
+ */
+export interface TokenClassificationInput {
+	/**
+	 * One or several texts whose tokens are to be classified
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Token Classification
+ */
+export interface TokenClassificationParameters {
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: AggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: any;
+}
+
+/**
+ * "none": Do not aggregate tokens
+ *
+ * "simple": Group consecutive tokens with the same label in a single entity.
+ *
+ * "first": Similar to "simple", also preserves word integrity (uses the label predicted for
+ * the first token in a word).
+ *
+ * "average": Similar to "simple", also preserves word integrity (uses the label with the
+ * highest score, averaged across the word's tokens).
+ *
+ * "max": Similar to "simple", also preserves word integrity (uses the label with the highest
+ * score across the word's tokens).
+ */
+export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max";
+
+/**
+ * Outputs of inference for the Token Classification task
+ */
+export interface TokenClassificationOutput {
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: any;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The corresponding text
+	 */
+	word?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
new file mode 100644
index 000000000..b6faf9d0f
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -0,0 +1,56 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Video Classification inference
+ */
+export interface VideoClassificationInput {
+	/**
+	 * One or several videos to be classified
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Video Classification
+ */
+export interface VideoClassificationParameters {
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Video Classification task
+ */
+export interface VideoClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
new file mode 100644
index 000000000..c2175d49c
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -0,0 +1,63 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Visual Question Answering inference
+ */
+export interface VisualQuestionAnsweringInput {
+	/**
+	 * One or more image-question pairs
+	 */
+	inputs: VisualQuestionAnsweringInputElement[] | VisualQuestionAnsweringInputElement;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface VisualQuestionAnsweringInputElement {
+	/**
+	 * The image.
+	 */
+	image: any;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: any;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Visual Question Answering
+ */
+export interface VisualQuestionAnsweringParameters {
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return fewer than topK answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Visual Question Answering task
+ */
+export interface VisualQuestionAnsweringOutput {
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: any;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
new file mode 100644
index 000000000..67cd325ea
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -0,0 +1,67 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Classification inference
+ */
+export interface ZeroShotClassificationInput {
+	/**
+	 * One or several text + candidate labels pairs to classify
+	 */
+	inputs: ZeroShotClassificationInputElement[] | ZeroShotClassificationInputElement;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: any;
+}
+
+export interface ZeroShotClassificationInputElement {
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Classification
+ */
+export interface ZeroShotClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Classification task
+ */
+export interface ZeroShotClassificationOutput {
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
new file mode 100644
index 000000000..21e01d179
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -0,0 +1,61 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Image Classification inference
+ */
+export interface ZeroShotImageClassificationInput {
+	/**
+	 * One or several images to classify
+	 */
+	inputs: ZeroShotImageClassificationInputElement[] | ZeroShotImageClassificationInputElement;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: any;
+}
+
+export interface ZeroShotImageClassificationInputElement {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: any;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Image Classification
+ */
+export interface ZeroShotImageClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Image Classification task
+ */
+export interface ZeroShotImageClassificationOutput {
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
new file mode 100644
index 000000000..815c99cfe
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -0,0 +1,64 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Object Detection inference
+ */
+export interface ZeroShotObjectDetectionInput {
+	/**
+	 * One or several images to perform object detection on
+	 */
+	inputs: ZeroShotObjectDetectionInputs[] | ZeroShotObjectDetectionInputs;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+export interface ZeroShotObjectDetectionInputs {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: any;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Object Detection task
+ */
+export interface ZeroShotObjectDetectionOutput {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
+
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: any;
+}

From b27846cbb4253685fcc65cf010694bed9b23e75f Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:59:22 +0100
Subject: [PATCH 11/51] =?UTF-8?q?=E2=9C=A8=20Support=20cross-file=20refere?=
 =?UTF-8?q?nces?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 9ffe24bd4..d256c81c5 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -8,7 +8,7 @@ const TYPESCRIPT_HEADER_FILE = `
 /**
  * Inference code generated from the JSON schema spec in ./spec
  * 
- * Generated on ${new Date().toISOString()}
+ * Using src/scripts/inference-codegen
  */
 
 `;
@@ -30,8 +30,14 @@ const rootDirFinder = function (): string {
 	return "";
 };
 
-async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
-	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+/**
+ *
+ * @param taskId The ID of the task for which we are generating code
+ * @param taskSpecDir The path to the directory where the input.json & output.json files are
+ * @param allSpecFiles An array of paths to all the tasks specs. Allows resolving cross-file references ($ref).
+ */
+async function buildInputData(taskId: string, taskSpecDir: string, allSpecFiles: string[]): Promise<InputData> {
+	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore(), [], allSpecFiles);
 	await schema.addSource({
 		name: `${taskId}-input`,
 		schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }),
@@ -68,6 +74,9 @@ async function main() {
 			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
+	const allSpecFiles = allTasks
+		.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
+		.filter((filepath) => pathExists(filepath));
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
@@ -78,7 +87,7 @@ async function main() {
 		console.debug(`✨ Generating types for task`, task);
 
 		console.debug("   📦 Building input data");
-		const inputData = await buildInputData(task, taskSpecDir);
+		const inputData = await buildInputData(task, taskSpecDir, allSpecFiles);
 
 		console.debug("   🏭 Generating typescript code");
 		{

From 7d9a9f63d49009629f7564c8ae42bbc60ebb4e8c Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:00:36 +0100
Subject: [PATCH 12/51] regen following header change

---
 packages/tasks/src/tasks/audio-classification/inference.ts      | 2 +-
 .../tasks/src/tasks/automatic-speech-recognition/inference.ts   | 2 +-
 packages/tasks/src/tasks/depth-estimation/inference.ts          | 2 +-
 .../tasks/src/tasks/document-question-answering/inference.ts    | 2 +-
 packages/tasks/src/tasks/feature-extraction/inference.ts        | 2 +-
 packages/tasks/src/tasks/fill-mask/inference.ts                 | 2 +-
 packages/tasks/src/tasks/image-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/image-segmentation/inference.ts        | 2 +-
 packages/tasks/src/tasks/image-to-image/inference.ts            | 2 +-
 packages/tasks/src/tasks/image-to-text/inference.ts             | 2 +-
 packages/tasks/src/tasks/object-detection/inference.ts          | 2 +-
 packages/tasks/src/tasks/question-answering/inference.ts        | 2 +-
 packages/tasks/src/tasks/table-question-answering/inference.ts  | 2 +-
 packages/tasks/src/tasks/text-classification/inference.ts       | 2 +-
 packages/tasks/src/tasks/text-generation/inference.ts           | 2 +-
 packages/tasks/src/tasks/text-to-speech/inference.ts            | 2 +-
 packages/tasks/src/tasks/token-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/video-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/visual-question-answering/inference.ts | 2 +-
 packages/tasks/src/tasks/zero-shot-classification/inference.ts  | 2 +-
 .../tasks/src/tasks/zero-shot-image-classification/inference.ts | 2 +-
 .../tasks/src/tasks/zero-shot-object-detection/inference.ts     | 2 +-
 22 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 33764ad33..a1f068a48 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 84540aa70..87a78c95b 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 0feea79b1..19fa43c11 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index ac1f84865..dc5d92e05 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 8a043fc20..b905bc3dc 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 6a49d856d..c603d73d5 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 0d5d438d6..6114ff9cc 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 88548bc2b..b9131b11f 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6150a5eb8..c1c371033 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 66f520248..917b8bf0a 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 22edb2cce..ebcd5eeb1 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 829e1fd1b..c5df0be04 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 8b7a3d227..58adb8c06 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 75a7032a2..dc924889c 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 86725e74e..4d86e0a99 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 23ad75189..75c54a166 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 3b7e5e0b3..dffcdcf38 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index b6faf9d0f..7b2de7049 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index c2175d49c..14dc539e9 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 67cd325ea..0908a599b 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 21e01d179..46e7860c3 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 815c99cfe..fe227f31f 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**

From dbd0254d3929b272b023e02b9801fba5652b3e2f Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:01:28 +0100
Subject: [PATCH 13/51] =?UTF-8?q?=E2=9C=A8=20Add=20text2text-generation=20?=
 =?UTF-8?q?task=20&=20reference=20it=20from=20summarization/translation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/tasks/summarization/inference.ts      | 59 ++++++++++++++++++
 .../src/tasks/summarization/spec/input.json   |  6 ++
 .../src/tasks/summarization/spec/output.json  |  6 ++
 .../src/tasks/text2text-generation/about.md   | 15 +++++
 .../src/tasks/text2text-generation/data.ts    | 18 ++++++
 .../tasks/text2text-generation/inference.ts   | 55 +++++++++++++++++
 .../text2text-generation/spec/input.json      | 61 +++++++++++++++++++
 .../text2text-generation/spec/output.json     | 16 +++++
 .../tasks/src/tasks/translation/inference.ts  | 59 ++++++++++++++++++
 .../src/tasks/translation/spec/input.json     |  6 ++
 .../src/tasks/translation/spec/output.json    |  6 ++
 11 files changed, 307 insertions(+)
 create mode 100644 packages/tasks/src/tasks/summarization/inference.ts
 create mode 100644 packages/tasks/src/tasks/summarization/spec/input.json
 create mode 100644 packages/tasks/src/tasks/summarization/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text2text-generation/about.md
 create mode 100644 packages/tasks/src/tasks/text2text-generation/data.ts
 create mode 100644 packages/tasks/src/tasks/text2text-generation/inference.ts
 create mode 100644 packages/tasks/src/tasks/text2text-generation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text2text-generation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/translation/inference.ts
 create mode 100644 packages/tasks/src/tasks/translation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/translation/spec/output.json

diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
new file mode 100644
index 000000000..e4464c748
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Summarization inference
+ *
+ * Inputs for Text2text Generation inference
+ */
+export interface SummarizationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs for Summarization inference
+ *
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface SummarizationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/summarization/spec/input.json b/packages/tasks/src/tasks/summarization/spec/input.json
new file mode 100644
index 000000000..b7c09d1db
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/spec/input.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/input.json",
+	"$id": "/inference/schemas/summarization/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Summarization inference"
+}
diff --git a/packages/tasks/src/tasks/summarization/spec/output.json b/packages/tasks/src/tasks/summarization/spec/output.json
new file mode 100644
index 000000000..df7331ee6
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/spec/output.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/output.json",
+	"$id": "/inference/schemas/summarization/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Summarization inference"
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/about.md b/packages/tasks/src/tasks/text2text-generation/about.md
new file mode 100644
index 000000000..fdb455844
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/about.md
@@ -0,0 +1,15 @@
+## Use Cases
+
+You can contribute to this area with common use cases of the task!
+
+## Task Variants
+
+This place can be filled with variants of this task if there are any.
+
+## Inference
+
+This section should have useful information about how to pull a model from Hugging Face Hub that is a part of a library specialized in a task and use it.
+
+## Useful Resources
+
+In this area, you can insert useful resources about how to train or use a model for this task.
diff --git a/packages/tasks/src/tasks/text2text-generation/data.ts b/packages/tasks/src/tasks/text2text-generation/data.ts
new file mode 100644
index 000000000..7a7097e59
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/data.ts
@@ -0,0 +1,18 @@
+import type { TaskDataCustom } from "..";
+
+const taskData: TaskDataCustom = {
+	datasets: [],
+	demo: {
+		inputs: [],
+		outputs: [],
+	},
+	isPlaceholder: false,
+	metrics: [],
+	models: [],
+	spaces: [],
+	summary: "",
+	widgetModels: [],
+	youtubeId: undefined,
+};
+
+export default taskData;
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
new file mode 100644
index 000000000..5f144702f
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -0,0 +1,55 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text2text Generation inference
+ */
+export interface Text2TextGenerationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface Text2TextGenerationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
new file mode 100644
index 000000000..1b7077c65
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -0,0 +1,61 @@
+{
+	"$id": "/inference/schemas/text2text-generation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text2text Generation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more texts to use for text2text generation",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/Text2textGenerationParameters"
+		}
+	},
+	"$defs": {
+		"Text2textGenerationParameters": {
+			"description": "Additional inference parameters for Text2text Generation",
+			"type": "object",
+			"properties": {
+				"cleanUpTokenizationSpaces": {
+					"type": "boolean",
+					"description": "Whether to clean up the potential extra spaces in the text output."
+				},
+				"truncation": {
+					"type": "string",
+					"description": "The truncation strategy to use",
+					"oneOf": [
+						{
+							"const": "do_not_truncate"
+						},
+						{
+							"const": "longest_first"
+						},
+						{
+							"const": "only_first"
+						},
+						{
+							"const": "only_second"
+						}
+					]
+				},
+				"generateParameters": {
+					"type": "object",
+					"description": "Additional parametrization of the text generation algorithm"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
new file mode 100644
index 000000000..5d6cf0cee
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"$id": "/inference/schemas/text2text-generation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text2text Generation task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text."
+			}
+		},
+		"required": ["generatedText"]
+	}
+}
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
new file mode 100644
index 000000000..db7f74739
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Translation inference
+ *
+ * Inputs for Text2text Generation inference
+ */
+export interface TranslationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs for Translation inference
+ *
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface TranslationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/translation/spec/input.json b/packages/tasks/src/tasks/translation/spec/input.json
new file mode 100644
index 000000000..e3aac752c
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/spec/input.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/input.json",
+	"$id": "/inference/schemas/translation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Translation inference"
+}
diff --git a/packages/tasks/src/tasks/translation/spec/output.json b/packages/tasks/src/tasks/translation/spec/output.json
new file mode 100644
index 000000000..6dcb98077
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/spec/output.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/output.json",
+	"$id": "/inference/schemas/translation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Translation inference"
+}

From 6d903489de637b1f83f31560fbb052683309a17c Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:31:54 +0100
Subject: [PATCH 14/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Use=20$id,=20$defs?=
 =?UTF-8?q?=20&=20title?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/tasks/audio-classification/spec/input.json |  7 ++++---
 .../tasks/audio-classification/spec/output.json    |  2 +-
 .../automatic-speech-recognition/spec/input.json   |  7 ++++---
 .../automatic-speech-recognition/spec/output.json  |  2 +-
 .../src/tasks/depth-estimation/spec/input.json     |  7 ++++---
 .../src/tasks/depth-estimation/spec/output.json    |  2 +-
 .../tasks/document-question-answering/inference.ts |  4 ++--
 .../document-question-answering/spec/input.json    | 11 ++++++-----
 .../document-question-answering/spec/output.json   |  2 +-
 .../src/tasks/feature-extraction/spec/input.json   |  7 ++++---
 .../src/tasks/feature-extraction/spec/output.json  |  9 +++++----
 packages/tasks/src/tasks/fill-mask/spec/input.json |  7 ++++---
 .../tasks/src/tasks/fill-mask/spec/output.json     |  2 +-
 .../src/tasks/image-classification/spec/input.json |  7 ++++---
 .../tasks/image-classification/spec/output.json    |  2 +-
 .../src/tasks/image-segmentation/spec/input.json   |  7 ++++---
 .../src/tasks/image-segmentation/spec/output.json  |  2 +-
 .../tasks/src/tasks/image-to-image/spec/input.json |  7 ++++---
 .../src/tasks/image-to-image/spec/output.json      |  2 +-
 .../tasks/src/tasks/image-to-text/spec/input.json  |  7 ++++---
 .../tasks/src/tasks/image-to-text/spec/output.json |  2 +-
 .../tasks/src/tasks/object-detection/inference.ts  |  4 ++--
 .../src/tasks/object-detection/spec/input.json     |  7 ++++---
 .../src/tasks/object-detection/spec/output.json    |  6 +++---
 .../src/tasks/question-answering/inference.ts      |  4 ++--
 .../src/tasks/question-answering/spec/input.json   | 14 ++++++++------
 .../src/tasks/question-answering/spec/output.json  |  3 ++-
 .../tasks/src/tasks/summarization/inference.ts     |  6 +++---
 .../tasks/table-question-answering/spec/input.json |  7 ++++---
 .../table-question-answering/spec/output.json      |  2 +-
 .../src/tasks/text-classification/spec/input.json  |  7 ++++---
 .../src/tasks/text-classification/spec/output.json |  2 +-
 .../src/tasks/text-generation/spec/input.json      |  7 ++++---
 .../src/tasks/text-generation/spec/output.json     |  2 +-
 .../tasks/src/tasks/text-to-speech/spec/input.json |  7 ++++---
 .../src/tasks/text-to-speech/spec/output.json      |  2 +-
 .../src/tasks/text2text-generation/inference.ts    |  6 +++---
 .../src/tasks/text2text-generation/spec/input.json |  4 +++-
 .../src/tasks/token-classification/spec/input.json |  7 ++++---
 .../tasks/token-classification/spec/output.json    |  2 +-
 packages/tasks/src/tasks/translation/inference.ts  |  6 +++---
 .../src/tasks/video-classification/spec/input.json |  7 ++++---
 .../tasks/video-classification/spec/output.json    |  2 +-
 .../tasks/visual-question-answering/inference.ts   |  4 ++--
 .../visual-question-answering/spec/input.json      | 14 ++++++++------
 .../visual-question-answering/spec/output.json     |  2 +-
 .../tasks/zero-shot-classification/inference.ts    |  4 ++--
 .../tasks/zero-shot-classification/spec/input.json | 14 ++++++++------
 .../zero-shot-classification/spec/output.json      |  2 +-
 .../zero-shot-image-classification/inference.ts    |  4 ++--
 .../zero-shot-image-classification/spec/input.json | 14 ++++++++------
 .../spec/output.json                               |  2 +-
 .../tasks/zero-shot-object-detection/inference.ts  |  8 ++++----
 .../zero-shot-object-detection/spec/input.json     | 14 ++++++++------
 .../zero-shot-object-detection/spec/output.json    |  6 +++---
 55 files changed, 169 insertions(+), 138 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index f2f3fbfbf..29357710d 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+	"$id": "/inference/schemas/audio-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/AudioClassificationParameters"
+			"$ref": "#/$defs/AudioClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"AudioClassificationParameters": {
+			"title": "AudioClassificationParameters",
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index ddacf5872..83e7abe71 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/output.json",
+	"$id": "/inference/schemas/audio-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"title": "AudioClassificationOutput",
 	"description": "Outputs for Audio Classification inference",
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index dfd1c4bdb..a4034b5e1 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/input.json",
+	"$id": "/inference/schemas/automatic-speech-recognition/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/AutomaticSpeechRecognitionParameters"
+			"$ref": "#/$defs/AutomaticSpeechRecognitionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"AutomaticSpeechRecognitionParameters": {
+			"title": "AutomaticSpeechRecognitionParameters",
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index e11153af6..a8b8af782 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/output.json",
+	"$id": "/inference/schemas/automatic-speech-recognition/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index 8483f13b5..f33df6444 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/depth-estimation/input.json",
+	"$id": "/inference/schemas/depth-estimation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/DepthEstimationParameters"
+			"$ref": "#/$defs/DepthEstimationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"DepthEstimationParameters": {
+			"title": "DepthEstimationParameters",
 			"description": "Additional inference parameters for Depth Estimation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
index 643aaaa7b..c3ebebcc5 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/output.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/depth-estimation/output.json",
+	"$id": "/inference/schemas/depth-estimation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Depth Estimation task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index dc5d92e05..5a8eeeb08 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface DocumentQuestionAnsweringInput {
 	/**
 	 * The
 	 */
-	inputs: DocumentAndQuestion[] | DocumentAndQuestion;
+	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface DocumentQuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface DocumentAndQuestion {
+export interface DocumentQuestionAnsweringInpu {
 	/**
 	 * The image on which the question is asked
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index dc72a24b2..86d0708c5 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/document-question-answering/input.json",
+	"$id": "/inference/schemas/document-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
@@ -8,23 +8,24 @@
 			"description": "The ",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/DocumentAndQuestion"
+					"$ref": "#/$defs/DocumentAndQuestion"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/DocumentAndQuestion"
+						"$ref": "#/$defs/DocumentAndQuestion"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/DocumentQuestionAnsweringParameters"
+			"$ref": "#/$defs/DocumentQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"DocumentQuestionAnsweringParameters": {
+			"title": "DocumentQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Document Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
index 60f6b5314..4c7752775 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/document-question-answering/output.json",
+	"$id": "/inference/schemas/document-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Document Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index afa1ec998..8bf05339a 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/feature-extraction/input.json",
+	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Feature Extraction inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/FeatureExtractionParameters"
+			"$ref": "#/$defs/FeatureExtractionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"FeatureExtractionParameters": {
+			"title": "FeatureExtractionParameters",
 			"description": "Additional inference parameters for Feature Extraction",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index f2e0ce2bf..4fac04cfe 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -1,14 +1,15 @@
 {
-	"id": "http://huggingface.co/inference/schemas/feature-extraction/output.json",
+	"$id": "/inference/schemas/feature-extraction/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Feature Extraction task",
 	"type": "array",
 	"items": {
 		"description": "The features computed by the mode, as a nested list of floats",
-		"$ref": "#/definitions/FeatureDimension"
+		"$ref": "#/$defs/FeatureTensor"
 	},
-	"definitions": {
-		"FeatureDimension": {
+	"$defs": {
+		"FeatureTensor": {
+			"title": "FeatureTensor",
 			"type": "array",
 			"items": {
 				"anyOf": [
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index b0588e21a..6f7402efb 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/fill-mask/input.json",
+	"$id": "/inference/schemas/fill-mask/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/FillMaskParameters"
+			"$ref": "#/$defs/FillMaskParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"FillMaskParameters": {
+			"title": "FillMaskParameters",
 			"description": "Additional inference parameters for Fill Mask",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
index 9ecf5aff8..3453d65d4 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/output.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/fill-mask/output.json",
+	"$id": "/inference/schemas/fill-mask/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Fill Mask task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index a9d09224b..875fae0e0 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-classification/input.json",
+	"$id": "/inference/schemas/image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageClassificationParameters"
+			"$ref": "#/$defs/ImageClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageClassificationParameters": {
+			"title": "ImageClassificationParameters",
 			"description": "Additional inference parameters for Image Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index f48dc3e77..da8a2a5c7 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-classification/output.json",
+	"$id": "/inference/schemas/image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 4063d6619..5e050b8c7 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-segmentation/input.json",
+	"$id": "/inference/schemas/image-segmentation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageSegmentationParameters"
+			"$ref": "#/$defs/ImageSegmentationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageSegmentationParameters": {
+			"title": "ImageSegmentationParameters",
 			"description": "Additional inference parameters for Image Segmentation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 694abf493..80db732e3 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-segmentation/output.json",
+	"$id": "/inference/schemas/image-segmentation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Segmentation task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 2d2978c3a..38b1202ef 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-image/input.json",
+	"$id": "/inference/schemas/image-to-image/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageToImageParameters"
+			"$ref": "#/$defs/ImageToImageParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageToImageParameters": {
+			"title": "ImageToImageParameters",
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index 0ec41e450..d9c4f9bf2 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-image/output.json",
+	"$id": "/inference/schemas/image-to-image/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 405521847..140f9e27e 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-text/input.json",
+	"$id": "/inference/schemas/image-to-text/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageToTextParameters"
+			"$ref": "#/$defs/ImageToTextParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageToTextParameters": {
+			"title": "ImageToTextParameters",
 			"description": "Additional inference parameters for Image To Text",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
index 0c0392b50..81960cd22 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-text/output.json",
+	"$id": "/inference/schemas/image-to-text/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Text task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index ebcd5eeb1..1a7785805 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -40,7 +40,7 @@ export interface ObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: BoundingBox;
+	box: Box;
 	/**
 	 * The predicted label for the bounding box
 	 */
@@ -56,7 +56,7 @@ export interface ObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface BoundingBox {
+export interface Box {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 7698570f6..f8647e78a 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/object-detection/input.json",
+	"$id": "/inference/schemas/object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ObjectDetectionParameters"
+			"$ref": "#/$defs/ObjectDetectionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ObjectDetectionParameters": {
+			"title": "ObjectDetectionParameters",
 			"description": "Additional inference parameters for Object Detection",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index ef46a2265..41d0ed887 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/object-detection/output.json",
+	"$id": "/inference/schemas/object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Object Detection task",
 	"type": "array",
@@ -15,13 +15,13 @@
 				"description": "The associated score / probability"
 			},
 			"box": {
-				"$ref": "#/definitions/BoundingBox",
+				"$ref": "#/$defs/BoundingBox",
 				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
 			}
 		},
 		"required": ["box", "label", "score"]
 	},
-	"definitions": {
+	"$defs": {
 		"BoundingBox": {
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index c5df0be04..493c4b7e5 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface QuestionAnsweringInput {
 	/**
 	 * One or several question+context pairs to answer
 	 */
-	inputs: QuestionAnsweringInputElement[] | QuestionAnsweringInputElement;
+	inputs: SquadExample[] | SquadExample;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface QuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface QuestionAnsweringInputElement {
+export interface SquadExample {
 	/**
 	 * The context to be used for answering the question
 	 */
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 9f1737f03..9eab32e13 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/question-answering/input.json",
+	"$id": "/inference/schemas/question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
@@ -8,23 +8,24 @@
 			"description": "One or several question+context pairs to answer",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/QuestionAnsweringInput"
+					"$ref": "#/$defs/SquadExample"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/QuestionAnsweringInput"
+						"$ref": "#/$defs/SquadExample"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/QuestionAnsweringParameters"
+			"$ref": "#/$defs/QuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
-		"QuestionAnsweringInput": {
+	"$defs": {
+		"SquadExample": {
+			"title": "SquadExample",
 			"type": "object",
 			"properties": {
 				"question": {
@@ -39,6 +40,7 @@
 			"required": ["question", "context"]
 		},
 		"QuestionAnsweringParameters": {
+			"title": "QuestionAnsweringParameters",
 			"description": "Additional inference parameters for Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/question-answering/spec/output.json b/packages/tasks/src/tasks/question-answering/spec/output.json
index eea7e8e51..9da8f988a 100644
--- a/packages/tasks/src/tasks/question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/question-answering/spec/output.json
@@ -1,6 +1,7 @@
 {
-	"id": "http://huggingface.co/inference/schemas/question-answering/output.json",
+	"$id": "/inference/schemas/question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "QuestionAnsweringOutput",
 	"description": "Outputs of inference for the Question Answering task",
 	"type": "array",
 	"items": {
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index e4464c748..1b6579d16 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -17,7 +17,7 @@ export interface SummarizationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -26,7 +26,7 @@ export interface SummarizationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -34,7 +34,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index 8d35bcf71..aa7c7231f 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/table-question-answering/input.json",
+	"$id": "/inference/schemas/table-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
@@ -30,11 +30,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TableQuestionAnsweringParameters"
+			"$ref": "#/$defs/TableQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TableQuestionAnsweringParameters": {
+			"title": "TableQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Table Question Answering",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
index bb7969e3b..864900647 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/table-question-answering/output.json",
+	"$id": "/inference/schemas/table-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Table Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index b3b44d2ab..af40fea2e 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-classification/input.json",
+	"$id": "/inference/schemas/text-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextClassificationParameters"
+			"$ref": "#/$defs/TextClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextClassificationParameters": {
+			"title": "TextClassificationParameters",
 			"description": "Additional inference parameters for Text Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 1c317ed02..4e6d69ed9 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-classification/output.json",
+	"$id": "/inference/schemas/text-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 08f038702..9b5d3d08e 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+	"$id": "/inference/schemas/text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextGenerationParameters"
+			"$ref": "#/$defs/TextGenerationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextGenerationParameters": {
+			"title": "TextGenerationParameters",
 			"description": "Additional inference parameters for Text Generation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index ccbeaea20..4f1eb95e5 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+	"$id": "/inference/schemas/text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text Generation inference",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index c0e94850e..96febb6fc 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-to-audio/input.json",
+	"$id": "/inference/schemas/text-to-audio/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextToAudioParameters"
+			"$ref": "#/$defs/TextToAudioParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextToAudioParameters": {
+			"title": "TextToAudioParameters",
 			"description": "Additional inference parameters for Text To Audio",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index e8e92bbf8..f91a9563e 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-to-audio/output.json",
+	"$id": "/inference/schemas/text-to-audio/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text To Audio task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 5f144702f..48dd088db 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -15,7 +15,7 @@ export interface Text2TextGenerationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -24,7 +24,7 @@ export interface Text2TextGenerationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -32,7 +32,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index 1b7077c65..bec8fedfc 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -25,6 +25,7 @@
 	},
 	"$defs": {
 		"Text2textGenerationParameters": {
+			"title": "Text2textGenerationParameters",
 			"description": "Additional inference parameters for Text2text Generation",
 			"type": "object",
 			"properties": {
@@ -50,7 +51,8 @@
 						}
 					]
 				},
-				"generateParameters": {
+				"Parameters": {
+					"title": "generateParameters",
 					"type": "object",
 					"description": "Additional parametrization of the text generation algorithm"
 				}
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index d70c71a81..8ca4b07d3 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/token-classification/input.json",
+	"$id": "/inference/schemas/token-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TokenClassificationParameters"
+			"$ref": "#/$defs/TokenClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TokenClassificationParameters": {
+			"title": "TokenClassificationParameters",
 			"description": "Additional inference parameters for Token Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
index 0e9a03768..7685b740b 100644
--- a/packages/tasks/src/tasks/token-classification/spec/output.json
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/token-classification/output.json",
+	"$id": "/inference/schemas/token-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Token Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index db7f74739..7f4d032a1 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -17,7 +17,7 @@ export interface TranslationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -26,7 +26,7 @@ export interface TranslationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -34,7 +34,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 9f58d0bf3..91b9f9642 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/video-classification/input.json",
+	"$id": "/inference/schemas/video-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/VideoClassificationParameters"
+			"$ref": "#/$defs/VideoClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"VideoClassificationParameters": {
+			"title": "VideoClassificationParameters",
 			"description": "Additional inference parameters for Video Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index aa4e369f1..7121e472f 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/video-classification/output.json",
+	"$id": "/inference/schemas/video-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Video Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 14dc539e9..05a57db48 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface VisualQuestionAnsweringInput {
 	/**
 	 * One or more image-question pairs
 	 */
-	inputs: VisualQuestionAnsweringInputElement[] | VisualQuestionAnsweringInputElement;
+	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface VisualQuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface VisualQuestionAnsweringInputElement {
+export interface VisualQuestionAnsweringInputSingle {
 	/**
 	 * The image.
 	 */
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 134351b6b..cc6e5d93a 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/visual-question-answering/input.json",
+	"$id": "/inference/schemas/visual-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or more image-question pairs",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/VisualQuestionAnsweringInput"
+					"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/VisualQuestionAnsweringInput"
+						"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/VisualQuestionAnsweringParameters"
+			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
-		"VisualQuestionAnsweringInput": {
+	"$defs": {
+		"VisualQuestionAnsweringInputSingle": {
 			"type": "object",
+			"title": "VisualQuestionAnsweringInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image."
@@ -37,6 +38,7 @@
 			"required": ["question", "image"]
 		},
 		"VisualQuestionAnsweringParameters": {
+			"title": "VisualQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Visual Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
index 808957434..2005d9f2f 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/visual-question-answering/output.json",
+	"$id": "/inference/schemas/visual-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Visual Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 0908a599b..bc4497098 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotClassificationInput {
 	/**
 	 * One or several text + candidate labels pairs to classify
 	 */
-	inputs: ZeroShotClassificationInputElement[] | ZeroShotClassificationInputElement;
+	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotClassificationInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotClassificationInputElement {
+export interface ZeroShotClassificationInputSingle {
 	/**
 	 * The set of possible class labels to classify the text into.
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index 3682a9ddf..e573f6817 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/input.json",
+	"$id": "/inference/schemas/zero-shot-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several text + candidate labels pairs to classify",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotClassificationInput"
+					"$ref": "#/$defs/ZeroShotClassificationInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotClassificationInput"
+						"$ref": "#/$defs/ZeroShotClassificationInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotClassificationParameters"
+			"$ref": "#/$defs/ZeroShotClassificationParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotClassificationInput": {
+	"$defs": {
+		"ZeroShotClassificationInputSingle": {
 			"type": "object",
+			"title": "ZeroShotClassificationInputSingle",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -42,6 +43,7 @@
 			"required": ["text", "candidateLabels"]
 		},
 		"ZeroShotClassificationParameters": {
+			"title": "ZeroShotClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 478e0bef2..54f226d9d 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/output.json",
+	"$id": "/inference/schemas/zero-shot-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 46e7860c3..4ae4ff04e 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotImageClassificationInput {
 	/**
 	 * One or several images to classify
 	 */
-	inputs: ZeroShotImageClassificationInputElement[] | ZeroShotImageClassificationInputElement;
+	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotImageClassificationInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotImageClassificationInputElement {
+export interface ZeroShotImageClassificationInputSingle {
 	/**
 	 * The candidate labels for this image
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 4e60af72e..029b19b2d 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/input.json",
+	"$id": "/inference/schemas/zero-shot-image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several images to classify",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotImageClassificationInput"
+					"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotImageClassificationInput"
+						"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotImageClassificationParameters"
+			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotImageClassificationInput": {
+	"$defs": {
+		"ZeroShotImageClassificationInputSingle": {
 			"type": "object",
+			"title": "ZeroShotImageClassificationInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image data to classify"
@@ -41,6 +42,7 @@
 			"required": ["image", "candidateLabels"]
 		},
 		"ZeroShotImageClassificationParameters": {
+			"title": "ZeroShotImageClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Image Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index a400d6622..102944ebc 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/output.json",
+	"$id": "/inference/schemas/zero-shot-image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Image Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index fe227f31f..64162ae7c 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotObjectDetectionInput {
 	/**
 	 * One or several images to perform object detection on
 	 */
-	inputs: ZeroShotObjectDetectionInputs[] | ZeroShotObjectDetectionInputs;
+	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotObjectDetectionInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotObjectDetectionInputs {
+export interface ZeroShotObjectDetectionInputSingle {
 	/**
 	 * The candidate labels for this image
 	 */
@@ -39,7 +39,7 @@ export interface ZeroShotObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: BoundingBox;
+	box: Box;
 	/**
 	 * A candidate label
 	 */
@@ -55,7 +55,7 @@ export interface ZeroShotObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface BoundingBox {
+export interface Box {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 93e95f25f..f2929226b 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/input.json",
+	"$id": "/inference/schemas/zero-shot-object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several images to perform object detection on",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+					"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+						"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotObjectDetectionParameters"
+			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotObjectDetectionInputs": {
+	"$defs": {
+		"ZeroShotObjectDetectionInputSingle": {
 			"type": "object",
+			"title": "ZeroShotObjectDetectionInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image data to generate bounding boxes from"
@@ -41,6 +42,7 @@
 			"required": ["image", "candidateLabels"]
 		},
 		"ZeroShotObjectDetectionParameters": {
+			"title": "ZeroShotObjectDetectionParameters",
 			"description": "Additional inference parameters for Zero Shot Object Detection",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index c5fd05eb3..0e725af9e 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/output.json",
+	"$id": "/inference/schemas/zero-shot-object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Object Detection task",
 	"type": "array",
@@ -15,13 +15,13 @@
 				"description": "The associated score / probability"
 			},
 			"box": {
-				"$ref": "#/definitions/BoundingBox",
+				"$ref": "#/$defs/BoundingBox",
 				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
 			}
 		},
 		"required": ["box", "label", "score"]
 	},
-	"definitions": {
+	"$defs": {
 		"BoundingBox": {
 			"type": "object",
 			"properties": {

From d027115cd2f64a324be5e90f787d25c2c7054a99 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:46:58 +0100
Subject: [PATCH 15/51] =?UTF-8?q?=E2=9C=A8=20Add=20sentence=20similarity?=
 =?UTF-8?q?=20task=20spec?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/sentence-similarity/inference.ts    | 30 +++++++++++
 .../tasks/sentence-similarity/spec/input.json | 52 +++++++++++++++++++
 .../sentence-similarity/spec/output.json      | 11 ++++
 3 files changed, 93 insertions(+)
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/inference.ts
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/spec/input.json
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/spec/output.json

diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
new file mode 100644
index 000000000..43dcc4777
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -0,0 +1,30 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Sentence similarity inference
+ */
+export interface SentenceSimilarityInput {
+	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+export interface SentenceSimilarityInputSingle {
+	/**
+	 * A list of strings which will be compared against sourceSentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
new file mode 100644
index 000000000..cfb884abe
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -0,0 +1,52 @@
+{
+	"$id": "/inference/schemas/sentence-similarity/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Sentence similarity inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"anyOf": [
+				{
+					"$ref": "#/$defs/SentenceSimilarityInputSingle"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/$defs/SentenceSimilarityInputSingle"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/SentenceSimilarityParameters"
+		}
+	},
+	"$defs": {
+		"SentenceSimilarityInputSingle": {
+			"title": "SentenceSimilarityInputSingle",
+			"type": "object",
+			"properties": {
+				"sourceSentence": {
+					"description": "The string that you wish to compare the other strings with. This can be a phrase, sentence, or longer passage, depending on the model being used.",
+					"type": "string"
+				},
+				"sentences": {
+					"type": "array",
+					"description": "A list of strings which will be compared against sourceSentence.",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["sourceSentence", "sentences"]
+		},
+		"SentenceSimilarityParameters": {
+			"title": "SentenceSimilarityParameters",
+			"description": "Additional inference parameters for Sentence Similarity",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/output.json b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
new file mode 100644
index 000000000..e1fc1c9ac
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
@@ -0,0 +1,11 @@
+{
+	"$id": "/inference/schemas/sentence-similarity/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SentenceSimilarityOutput",
+	"description": "Outputs of inference for the Sentence Similarity task",
+	"type": "array",
+	"items": {
+		"description": "The associated similarity score for each of the given sentences",
+		"type": "number"
+	}
+}

From 224c039a6599e3a4ea674745eb9e386498176435 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:03:08 +0100
Subject: [PATCH 16/51] fix typo in text2text-generation spec

---
 .../tasks/src/tasks/text2text-generation/spec/output.json   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 5d6cf0cee..12fe1f3dc 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -11,6 +11,8 @@
 				"description": "The generated text."
 			}
 		},
-		"required": ["generatedTex"]
+		"required": [
+			"generatedText"
+		]
 	}
-}
+}
\ No newline at end of file

From b8dae864b9ca519ff479b248c2e6ee535966fbd2 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:04:22 +0100
Subject: [PATCH 17/51] regenerate code

---
 packages/tasks/src/tasks/summarization/inference.ts         | 3 +--
 packages/tasks/src/tasks/text2text-generation/inference.ts  | 3 +--
 .../tasks/src/tasks/text2text-generation/spec/output.json   | 6 ++----
 packages/tasks/src/tasks/translation/inference.ts           | 3 +--
 4 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 1b6579d16..b2e6272c5 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -50,10 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 48dd088db..4cd870928 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -46,10 +46,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 12fe1f3dc..190aa6014 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -11,8 +11,6 @@
 				"description": "The generated text."
 			}
 		},
-		"required": [
-			"generatedText"
-		]
+		"required": ["generatedText"]
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 7f4d032a1..c4f31b0ea 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -50,10 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }

From b84825ee03aa7970409ec1dc901ed508e8cd6bd3 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:09:38 +0100
Subject: [PATCH 18/51] Have text-to-speech refer to text-to-audio

---
 .../src/tasks/text-to-audio/spec/input.json   | 37 +++++++++++++++++++
 .../src/tasks/text-to-audio/spec/output.json  | 22 +++++++++++
 .../src/tasks/text-to-speech/spec/input.json  | 37 ++-----------------
 .../src/tasks/text-to-speech/spec/output.json | 21 ++---------
 .../src/tasks/text2text-generation/about.md   | 15 --------
 .../src/tasks/text2text-generation/data.ts    | 18 ---------
 6 files changed, 67 insertions(+), 83 deletions(-)
 create mode 100644 packages/tasks/src/tasks/text-to-audio/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-audio/spec/output.json
 delete mode 100644 packages/tasks/src/tasks/text2text-generation/about.md
 delete mode 100644 packages/tasks/src/tasks/text2text-generation/data.ts

diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
new file mode 100644
index 000000000..2ad196f5f
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -0,0 +1,37 @@
+{
+    "$id": "/inference/schemas/text-to-audio/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Text To Audio inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "One or several texts to generate audio for",
+            "anyOf": [
+                {
+                    "type": "string"
+                },
+                {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            ]
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/$defs/TextToAudioParameters"
+        }
+    },
+    "$defs": {
+        "TextToAudioParameters": {
+            "title": "TextToAudioParameters",
+            "description": "Additional inference parameters for Text To Audio",
+            "type": "object",
+            "properties": {}
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
new file mode 100644
index 000000000..c66555117
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -0,0 +1,22 @@
+{
+    "$id": "/inference/schemas/text-to-audio/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs of inference for the Text To Audio task",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "audio": {
+                "description": "The generated audio waveform."
+            },
+            "samplingRate": {
+                "type": "number",
+                "description": "The sampling rate of the generated audio waveform."
+            }
+        },
+        "required": [
+            "audio",
+            "samplingRate"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index 96febb6fc..533c7d02d 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -1,35 +1,6 @@
 {
-	"$id": "/inference/schemas/text-to-audio/input.json",
+	"$ref": "/inference/schemas/text-to-audio/input.json",
+	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Inputs for Text To Audio inference",
-	"type": "object",
-	"properties": {
-		"inputs": {
-			"description": "One or several texts to generate audio for",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/TextToAudioParameters"
-		}
-	},
-	"$defs": {
-		"TextToAudioParameters": {
-			"title": "TextToAudioParameters",
-			"description": "Additional inference parameters for Text To Audio",
-			"type": "object",
-			"properties": {}
-		}
-	},
-	"required": ["inputs"]
-}
+	"description": "Inputs for Text to Speech inference"
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index f91a9563e..1b591393f 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -1,19 +1,6 @@
 {
-	"$id": "/inference/schemas/text-to-audio/output.json",
+	"$ref": "/inference/schemas/text-to-audio/output.json",
+	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Outputs of inference for the Text To Audio task",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"audio": {
-				"description": "The generated audio waveform."
-			},
-			"samplingRate": {
-				"type": "number",
-				"description": "The sampling rate of the generated audio waveform."
-			}
-		},
-		"required": ["audio", "samplingRate"]
-	}
-}
+	"description": "Outputs for Text to Speech inference"
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text2text-generation/about.md b/packages/tasks/src/tasks/text2text-generation/about.md
deleted file mode 100644
index fdb455844..000000000
--- a/packages/tasks/src/tasks/text2text-generation/about.md
+++ /dev/null
@@ -1,15 +0,0 @@
-## Use Cases
-
-You can contribute this area with common use cases of the task!
-
-## Task Variants
-
-This place can be filled with variants of this task if there's any.
-
-## Inference
-
-This section should have useful information about how to pull a model from Hugging Face Hub that is a part of a library specialized in a task and use it.
-
-## Useful Resources
-
-In this area, you can insert useful resources about how to train or use a model for this task.
diff --git a/packages/tasks/src/tasks/text2text-generation/data.ts b/packages/tasks/src/tasks/text2text-generation/data.ts
deleted file mode 100644
index 7a7097e59..000000000
--- a/packages/tasks/src/tasks/text2text-generation/data.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import type { TaskDataCustom } from "..";
-
-const taskData: TaskDataCustom = {
-	datasets: [],
-	demo: {
-		inputs: [],
-		outputs: [],
-	},
-	isPlaceholder: false,
-	metrics: [],
-	models: [],
-	spaces: [],
-	summary: "",
-	widgetModels: [],
-	youtubeId: undefined,
-};
-
-export default taskData;

From 4484e394496c558d9f7a90803e9664c85c7eccaa Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:10:39 +0100
Subject: [PATCH 19/51] regenerate code

---
 .../src/tasks/text-to-audio/inference.ts      | 35 ++++++++++
 .../src/tasks/text-to-audio/spec/input.json   | 70 +++++++++----------
 .../src/tasks/text-to-audio/spec/output.json  | 39 +++++------
 .../src/tasks/text-to-speech/inference.ts     |  4 ++
 .../src/tasks/text-to-speech/spec/input.json  |  2 +-
 .../src/tasks/text-to-speech/spec/output.json |  2 +-
 6 files changed, 93 insertions(+), 59 deletions(-)
 create mode 100644 packages/tasks/src/tasks/text-to-audio/inference.ts

diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
new file mode 100644
index 000000000..3916184bd
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -0,0 +1,35 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text To Audio inference
+ */
+export interface TextToAudioInput {
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Text To Audio task
+ */
+export interface TextToAudioOutput {
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: any;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 2ad196f5f..96febb6fc 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -1,37 +1,35 @@
 {
-    "$id": "/inference/schemas/text-to-audio/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Text To Audio inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "One or several texts to generate audio for",
-            "anyOf": [
-                {
-                    "type": "string"
-                },
-                {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            ]
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/$defs/TextToAudioParameters"
-        }
-    },
-    "$defs": {
-        "TextToAudioParameters": {
-            "title": "TextToAudioParameters",
-            "description": "Additional inference parameters for Text To Audio",
-            "type": "object",
-            "properties": {}
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"$id": "/inference/schemas/text-to-audio/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Audio inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to generate audio for",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/TextToAudioParameters"
+		}
+	},
+	"$defs": {
+		"TextToAudioParameters": {
+			"title": "TextToAudioParameters",
+			"description": "Additional inference parameters for Text To Audio",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
index c66555117..f91a9563e 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -1,22 +1,19 @@
 {
-    "$id": "/inference/schemas/text-to-audio/output.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Outputs of inference for the Text To Audio task",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "audio": {
-                "description": "The generated audio waveform."
-            },
-            "samplingRate": {
-                "type": "number",
-                "description": "The sampling rate of the generated audio waveform."
-            }
-        },
-        "required": [
-            "audio",
-            "samplingRate"
-        ]
-    }
-}
\ No newline at end of file
+	"$id": "/inference/schemas/text-to-audio/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Audio task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"audio": {
+				"description": "The generated audio waveform."
+			},
+			"samplingRate": {
+				"type": "number",
+				"description": "The sampling rate of the generated audio waveform."
+			}
+		},
+		"required": ["audio", "samplingRate"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 75c54a166..137492b27 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -5,6 +5,8 @@
  */
 
 /**
+ * Inputs for Text to Speech inference
+ *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
@@ -20,6 +22,8 @@ export interface TextToSpeechInput {
 }
 
 /**
+ * Outputs for Text to Speech inference
+ *
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index 533c7d02d..dffbf7910 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -3,4 +3,4 @@
 	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text to Speech inference"
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index 1b591393f..4678592e8 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -3,4 +3,4 @@
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text to Speech inference"
-}
\ No newline at end of file
+}

From a9c9ae117395348776ae7644affcf9928f1c1228 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 10:57:51 +0100
Subject: [PATCH 20/51] Add quicktype-core from fork

---
 packages/tasks/package.json   |  4 +--
 packages/tasks/pnpm-lock.yaml | 54 ++++++++++++++++++-----------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 258679aba..7077133d0 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -43,6 +43,6 @@
 	"license": "MIT",
 	"devDependencies": {
 		"@types/node": "^20.11.5",
-		"quicktype-core": "^23.0.81"
+		"quicktype-core": "https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz"
 	}
-}
+}
\ No newline at end of file
diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml
index fedbbb7c3..0ff78350f 100644
--- a/packages/tasks/pnpm-lock.yaml
+++ b/packages/tasks/pnpm-lock.yaml
@@ -9,8 +9,8 @@ devDependencies:
     specifier: ^20.11.5
     version: 20.11.5
   quicktype-core:
-    specifier: ^23.0.81
-    version: 23.0.81
+    specifier: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz
+    version: '@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz'
 
 packages:
 
@@ -80,8 +80,8 @@ packages:
     resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==}
     dev: true
 
-  /js-base64@3.7.5:
-    resolution: {integrity: sha512-3MEt5DTINKqfScXKfJFrRbxkrnk2AxPWGBL/ycjz4dK8iqiSJ06UxD8jh8xuh6p10TX4t2+7FsBYVxxQbMg+qA==}
+  /js-base64@3.7.6:
+    resolution: {integrity: sha512-NPrWuHFxFUknr1KqJRDgUQPexQF0uIJWjeT+2KjEePhitQxQEx5EJBG1lVn5/hc8aLycTpXrDOgPQ6Zq+EDiTA==}
     dev: true
 
   /lodash@4.17.21:
@@ -118,28 +118,6 @@ packages:
     engines: {node: '>= 0.6.0'}
     dev: true
 
-  /quicktype-core@23.0.81:
-    resolution: {integrity: sha512-iJQpCEzSQIkffJPS5NC+0w+Rq9faGgz09L+WIbseu1toFfj+M/3KTG5jhzdY/uN88fWosAom2fMoEADA403+rQ==}
-    dependencies:
-      '@glideapps/ts-necessities': 2.1.3
-      '@types/urijs': 1.19.25
-      browser-or-node: 2.1.1
-      collection-utils: 1.0.1
-      cross-fetch: 4.0.0
-      is-url: 1.2.4
-      js-base64: 3.7.5
-      lodash: 4.17.21
-      pako: 1.0.11
-      pluralize: 8.0.0
-      readable-stream: 4.4.2
-      unicode-properties: 1.4.1
-      urijs: 1.19.11
-      wordwrap: 1.0.0
-      yaml: 2.3.4
-    transitivePeerDependencies:
-      - encoding
-    dev: true
-
   /readable-stream@4.4.2:
     resolution: {integrity: sha512-Lk/fICSyIhodxy1IDK2HazkeGjSmezAWX2egdtJnYhtzKEsBPJowlI6F6LPb5tqIQILrMbx22S5o3GuJavPusA==}
     engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
@@ -210,3 +188,27 @@ packages:
     resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==}
     engines: {node: '>= 14'}
     dev: true
+
+  '@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz':
+    resolution: {tarball: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz}
+    name: quicktype-core
+    version: 18.0.15
+    dependencies:
+      '@glideapps/ts-necessities': 2.1.3
+      '@types/urijs': 1.19.25
+      browser-or-node: 2.1.1
+      collection-utils: 1.0.1
+      cross-fetch: 4.0.0
+      is-url: 1.2.4
+      js-base64: 3.7.6
+      lodash: 4.17.21
+      pako: 1.0.11
+      pluralize: 8.0.0
+      readable-stream: 4.4.2
+      unicode-properties: 1.4.1
+      urijs: 1.19.11
+      wordwrap: 1.0.0
+      yaml: 2.3.4
+    transitivePeerDependencies:
+      - encoding
+    dev: true

From f9fd4f934edd133ff5b6621304a45ccc14b2baba Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 10:58:13 +0100
Subject: [PATCH 21/51] regenerate code

---
 .../tasks/src/scripts/inference-codegen.ts    |   2 +
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../src/tasks/depth-estimation/inference.ts   |  34 ++--
 .../document-question-answering/inference.ts  | 150 +++++++++---------
 .../src/tasks/feature-extraction/inference.ts |  22 +--
 .../tasks/feature-extraction/spec/output.json |  86 +++++-----
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++-----
 .../tasks/image-classification/inference.ts   |  50 +++---
 .../src/tasks/image-segmentation/inference.ts |  74 ++++-----
 .../src/tasks/image-to-image/inference.ts     |  24 +--
 .../src/tasks/image-to-text/inference.ts      |  42 ++---
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../src/tasks/question-answering/inference.ts | 146 ++++++++---------
 .../tasks/sentence-similarity/inference.ts    |  38 +++--
 .../src/tasks/summarization/inference.ts      |  58 +++----
 .../table-question-answering/inference.ts     |  76 ++++-----
 .../tasks/text-classification/inference.ts    |  58 +++----
 .../src/tasks/text-generation/inference.ts    | 128 +++++++--------
 .../src/tasks/text-to-audio/inference.ts      |  40 ++---
 .../src/tasks/text-to-speech/inference.ts     |  40 ++---
 .../tasks/text2text-generation/inference.ts   |  58 +++----
 .../tasks/token-classification/inference.ts   |  92 +++++------
 .../tasks/src/tasks/translation/inference.ts  |  58 +++----
 .../tasks/video-classification/inference.ts   |  66 ++++----
 .../visual-question-answering/inference.ts    |  74 ++++-----
 .../zero-shot-classification/inference.ts     |  82 +++++-----
 .../inference.ts                              |  70 ++++----
 .../zero-shot-object-detection/inference.ts   |  78 ++++-----
 29 files changed, 966 insertions(+), 912 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index d256c81c5..aa92ba5a4 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -61,6 +61,8 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 			"nice-property-names": true,
 			"prefer-unions": true,
 			"prefer-const-values": true,
+			"prefer-unknown": true,
+			// "explicit-unions": true,
 		},
 	});
 }
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index a1f068a48..d65ead71b 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-	/**
-	 * On or several audio files to classify
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: AudioClassificationParameters;
-	[property: string]: any;
+    /**
+     * On or several audio files to classify
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 87a78c95b..abaa0caef 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,31 +1,33 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-	/**
-	 * The input audio data
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * The input audio data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The recognized text.
-	 */
-	text: string;
-	[property: string]: any;
+    /**
+     * The recognized text.
+     */
+    text: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 19fa43c11..ba5975a74 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,22 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type DepthEstimationOutput = unknown[];
+
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-	/**
-	 * The input image data
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DepthEstimationParameters;
-	[property: string]: any;
+    /**
+     * The input image data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DepthEstimationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,9 +29,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 5a8eeeb08..6c730e277 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-	/**
-	 * The
-	 */
-	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DocumentQuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * The
+     */
+    inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DocumentQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface DocumentQuestionAnsweringInpu {
-	/**
-	 * The image on which the question is asked
-	 */
-	image?: any;
-	/**
-	 * A question to ask of the document
-	 */
-	question?: string;
-	[property: string]: any;
+    /**
+     * The image on which the question is asked
+     */
+    image?: unknown;
+    /**
+     * A question to ask of the document
+     */
+    question?: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,65 +39,65 @@ export interface DocumentQuestionAnsweringInpu {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-	/**
-	 * If the words in the document are too long to fit with the question for the model, it will
-	 * be split in several chunks with some overlap. This argument controls the size of that
-	 * overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * Language to use while running OCR. Defaults to english.
-	 */
-	lang?: string;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using doc_stride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Can return less
-	 * than top_k answers if there are not enough options available within the context.
-	 */
-	topK?: number;
-	/**
-	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-	 * skip the OCR step and use the provided bounding boxes instead.
-	 */
-	wordBoxes?: Array<number[] | string>;
-	[property: string]: any;
+    /**
+     * If the words in the document are too long to fit with the question for the model, it will
+     * be split in several chunks with some overlap. This argument controls the size of that
+     * overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * Language to use while running OCR. Defaults to english.
+     */
+    lang?: string;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using doc_stride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Can return less
+     * than top_k answers if there are not enough options available within the context.
+     */
+    topK?: number;
+    /**
+     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+     * skip the OCR step and use the provided bounding boxes instead.
+     */
+    wordBoxes?: Array<number[] | string>;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
-	[property: string]: any;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    end:    number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    start: number;
+    /**
+     * The index of each word/box pair that is in the answer
+     */
+    words: number[];
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index b905bc3dc..c6b6dcec5 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,20 +1,22 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Feature Extraction inference
  */
 export interface FeatureExtractionInput {
-	/**
-	 * One or several texts to get the features of
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to get the features of
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 4fac04cfe..47303e945 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -3,57 +3,49 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Feature Extraction task",
 	"type": "array",
+	"title": "FeatureExtractionOutput",
 	"items": {
 		"description": "The features computed by the mode, as a nested list of floats",
-		"$ref": "#/$defs/FeatureTensor"
-	},
-	"$defs": {
-		"FeatureTensor": {
-			"title": "FeatureTensor",
-			"type": "array",
-			"items": {
-				"anyOf": [
-					{
-						"type": "number"
-					},
-					{
-						"type": "array",
-						"items": {
-							"anyOf": [
-								{
-									"type": "number"
-								},
-								{
-									"type": "array",
-									"items": {
-										"anyOf": [
-											{
-												"type": "number"
-											},
-											{
-												"type": "array",
-												"items": {
-													"anyOf": [
-														{
-															"type": "number"
-														},
-														{
-															"type": "array",
-															"items": {
-																"type": "number"
-															}
-														}
-													]
+		"anyOf": [
+			{
+				"type": "number"
+			},
+			{
+				"type": "array",
+				"items": {
+					"anyOf": [
+						{
+							"type": "number"
+						},
+						{
+							"type": "array",
+							"items": {
+								"anyOf": [
+									{
+										"type": "number"
+									},
+									{
+										"type": "array",
+										"items": {
+											"anyOf": [
+												{
+													"type": "number"
+												},
+												{
+													"type": "array",
+													"items": {
+														"type": "number"
+													}
 												}
-											}
-										]
+											]
+										}
 									}
-								}
-							]
+								]
+							}
 						}
-					}
-				]
+					]
+				}
 			}
-		}
+		]
 	}
-}
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index c603d73d5..380ae8cd2 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-	/**
-	 * One or several texts with masked tokens
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: FillMaskParameters;
-	[property: string]: any;
+    /**
+     * One or several texts with masked tokens
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: FillMaskParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,39 +27,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-	/**
-	 * When passed, the model will limit the scores to the passed targets instead of looking up
-	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-	 * tokenized and the first resulting token will be used (with a warning, and that might be
-	 * slower).
-	 */
-	targets?: string[] | string;
-	/**
-	 * When passed, overrides the number of predictions to return.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When passed, the model will limit the scores to the passed targets instead of looking up
+     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+     * tokenized and the first resulting token will be used (with a warning, and that might be
+     * slower).
+     */
+    targets?: string[] | string;
+    /**
+     * When passed, overrides the number of predictions to return.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-	/**
-	 * The corresponding probability
-	 */
-	score: number;
-	/**
-	 * The corresponding input with the mask token prediction.
-	 */
-	sequence: string;
-	/**
-	 * The predicted token id (to replace the masked one).
-	 */
-	token: number;
-	/**
-	 * The predicted token (to replace the masked one).
-	 */
-	tokenStr: string;
-	[property: string]: any;
+    /**
+     * The corresponding probability
+     */
+    score: number;
+    /**
+     * The corresponding input with the mask token prediction.
+     */
+    sequence: string;
+    /**
+     * The predicted token id (to replace the masked one).
+     */
+    token: number;
+    /**
+     * The predicted token (to replace the masked one).
+     */
+    tokenStr: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 6114ff9cc..b6700e06e 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-	/**
-	 * On or several image files to classify
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageClassificationParameters;
-	[property: string]: any;
+    /**
+     * On or several image files to classify
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index b9131b11f..63fae5288 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-	/**
-	 * One or several image files to perform segmentation on
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageSegmentationParameters;
-	[property: string]: any;
+    /**
+     * One or several image files to perform segmentation on
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageSegmentationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,23 +27,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-	/**
-	 * Threshold to use when turning the predicted masks into binary values.
-	 */
-	maskThreshold?: number;
-	/**
-	 * Mask overlap threshold to eliminate small, disconnected segments.
-	 */
-	overlapMaskAreaThreshold?: number;
-	/**
-	 * Segmentation task to be performed, depending on model capabilities.
-	 */
-	subtask?: Subtask;
-	/**
-	 * Probability threshold to filter out predicted masks.
-	 */
-	threshold?: number;
-	[property: string]: any;
+    /**
+     * Threshold to use when turning the predicted masks into binary values.
+     */
+    maskThreshold?: number;
+    /**
+     * Mask overlap threshold to eliminate small, disconnected segments.
+     */
+    overlapMaskAreaThreshold?: number;
+    /**
+     * Segmentation task to be performed, depending on model capabilities.
+     */
+    subtask?: Subtask;
+    /**
+     * Probability threshold to filter out predicted masks.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 export type Subtask = "instance" | "panoptic" | "semantic";
@@ -52,13 +54,13 @@ export type Subtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-	/**
-	 * The label of the predicted segment
-	 */
-	label: string;
-	/**
-	 * The corresponding mask as a black-and-white image
-	 */
-	mask: any;
-	[property: string]: any;
+    /**
+     * The label of the predicted segment
+     */
+    label: string;
+    /**
+     * The corresponding mask as a black-and-white image
+     */
+    mask: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index c1c371033..6fb0f997e 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,20 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type ImageToImageOutput = unknown[];
+
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-	/**
-	 * One or more images to generate images from
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: any;
-	[property: string]: any;
+    /**
+     * One or more images to generate images from
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 917b8bf0a..12e0d4968 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-	/**
-	 * One or several images to generated text for
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageToTextParameters;
-	[property: string]: any;
+    /**
+     * One or several images to generated text for
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageToTextParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,20 +27,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-	/**
-	 * The amount of maximum tokens to generate.
-	 */
-	maxNewTokens?: number;
-	[property: string]: any;
+    /**
+     * The amount of maximum tokens to generate.
+     */
+    maxNewTokens?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 1a7785805..d294110f7 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-	/**
-	 * One or several input images to perform object detection on
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ObjectDetectionParameters;
-	[property: string]: any;
+    /**
+     * One or several input images to perform object detection on
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ObjectDetectionParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,31 +27,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-	/**
-	 * The probability necessary to make a prediction.
-	 */
-	threshold?: number;
-	[property: string]: any;
+    /**
+     * The probability necessary to make a prediction.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: Box;
-	/**
-	 * The predicted label for the bounding box
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: Box;
+    /**
+     * The predicted label for the bounding box
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -57,9 +59,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface Box {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: any;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 493c4b7e5..75293984d 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-	/**
-	 * One or several question+context pairs to answer
-	 */
-	inputs: SquadExample[] | SquadExample;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: QuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * One or several question+context pairs to answer
+     */
+    inputs: SquadExample[] | SquadExample;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: QuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface SquadExample {
-	/**
-	 * The context to be used for answering the question
-	 */
-	context: string;
-	/**
-	 * The question to be answered
-	 */
-	question: string;
-	[property: string]: any;
+    /**
+     * The context to be used for answering the question
+     */
+    context: string;
+    /**
+     * The question to be answered
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,63 +39,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-	/**
-	 * Attempts to align the answer to real words. Improves quality on space separated
-	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-	 */
-	alignToWords?: boolean;
-	/**
-	 * If the context is too long to fit with the question for the model, it will be split in
-	 * several chunks with some overlap. This argument controls the size of that overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer.
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using docStride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * Attempts to align the answer to real words. Improves quality on space separated
+     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+     */
+    alignToWords?: boolean;
+    /**
+     * If the context is too long to fit with the question for the model, it will be split in
+     * several chunks with some overlap. This argument controls the size of that overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer.
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using docStride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return fewer than topK answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	/**
-	 * The character position in the input where the answer ends.
-	 */
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	/**
-	 * The character position in the input where the answer begins.
-	 */
-	start: number;
-	[property: string]: any;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    /**
+     * The character position in the input where the answer ends.
+     */
+    end: number;
+    /**
+     * The probability associated with the answer.
+     */
+    score: number;
+    /**
+     * The character position in the input where the answer begins.
+     */
+    start: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 43dcc4777..40976f099 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,30 +1,34 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type SentenceSimilarityOutput = number[];
+
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 export interface SentenceSimilarityInputSingle {
-	/**
-	 * A list of strings which will be compared against the source_sentence.
-	 */
-	sentences: string[];
-	/**
-	 * The string that you wish to compare the other strings with. This can be a phrase,
-	 * sentence, or longer passage, depending on the model being used.
-	 */
-	sourceSentence: string;
-	[property: string]: any;
+    /**
+     * A list of strings which will be compared against the source_sentence.
+     */
+    sentences: string[];
+    /**
+     * The string that you wish to compare the other strings with. This can be a phrase,
+     * sentence, or longer passage, depending on the model being used.
+     */
+    sourceSentence: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index b2e6272c5..6169e942b 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 58adb8c06..22e6b8832 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,59 +1,61 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-	/**
-	 * One or several questions about a table
-	 */
-	inputs: Inputs;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several questions about a table
+     */
+    inputs: Inputs;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * One or several questions about a table
  */
 export interface Inputs {
-	/**
-	 * One or several questions to be answered about the table
-	 */
-	question?: string[] | string;
-	/**
-	 * The table to serve as context for the questions
-	 */
-	table?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several questions to be answered about the table
+     */
+    question?: string[] | string;
+    /**
+     * The table to serve as context for the questions
+     */
+    table?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-	/**
-	 * If the model has an aggregator, this returns the aggregator.
-	 */
-	aggregator?: string;
-	/**
-	 * The answer of the question given the table. If there is an aggregator, the answer will be
-	 * preceded by `AGGREGATOR >`.
-	 */
-	answer: string;
-	/**
-	 * List of strings made up of the answer cell values.
-	 */
-	cells: string[];
-	/**
-	 * Coordinates of the cells of the answers.
-	 */
-	coordinates: Array<number[]>;
-	[property: string]: any;
+    /**
+     * If the model has an aggregator, this returns the aggregator.
+     */
+    aggregator?: string;
+    /**
+     * The answer to the question given the table. If there is an aggregator, the answer will be
+     * preceded by `AGGREGATOR >`.
+     */
+    answer: string;
+    /**
+     * List of strings made up of the answer cell values.
+     */
+    cells: string[];
+    /**
+     * Coordinates of the cells of the answers.
+     */
+    coordinates: Array<number[]>;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index dc924889c..c490ee94d 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-	/**
-	 * One or several texts to classify
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several texts to classify
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: FunctionToApply;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: FunctionToApply;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type FunctionToApply = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type FunctionToApply = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 4d86e0a99..b28b1f225 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-	/**
-	 * The text to initialize generation with
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * The text to initialize generation with
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,63 +27,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-	/**
-	 * Whether to use logit sampling (true) or greedy search (false).
-	 */
-	doSample?: boolean;
-	/**
-	 * Maximum number of generated tokens.
-	 */
-	maxNewTokens?: number;
-	/**
-	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-	 * paper](https://hf.co/papers/1909.05858) for more details.
-	 */
-	repetitionPenalty?: number;
-	/**
-	 * Whether to prepend the prompt to the generated text.
-	 */
-	returnFullText?: boolean;
-	/**
-	 * Stop generating tokens if a member of `stop_sequences` is generated.
-	 */
-	stopSequences?: string[];
-	/**
-	 * The value used to modulate the logits distribution.
-	 */
-	temperature?: number;
-	/**
-	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-	 */
-	topK?: number;
-	/**
-	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-	 * up to `top_p` or higher are kept for generation.
-	 */
-	topP?: number;
-	/**
-	 * Truncate input tokens to the given size.
-	 */
-	truncate?: number;
-	/**
-	 * Typical Decoding mass. See [Typical Decoding for Natural Language
-	 * Generation](https://hf.co/papers/2202.00666) for more information
-	 */
-	typicalP?: number;
-	/**
-	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-	 */
-	watermark?: boolean;
-	[property: string]: any;
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-	/**
-	 * The generated text
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 3916184bd..f263ba834 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,35 +1,37 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to generate audio for
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: any;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: any;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 137492b27..ca08be005 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to generate audio for
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
@@ -27,13 +29,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: any;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: any;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 4cd870928..38aaf3ab2 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -46,9 +48,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index dffcdcf38..7b82c12c6 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-	/**
-	 * One or several texts which tokens are to be classified
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TokenClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several texts whose tokens are to be classified
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TokenClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-	/**
-	 * The strategy used to fuse tokens based on model predictions
-	 */
-	aggregationStrategy?: AggregationStrategy;
-	/**
-	 * A list of labels to ignore
-	 */
-	ignoreLabels?: string[];
-	/**
-	 * The number of overlapping tokens between chunks when splitting the input text.
-	 */
-	stride?: number;
-	[property: string]: any;
+    /**
+     * The strategy used to fuse tokens based on model predictions
+     */
+    aggregationStrategy?: AggregationStrategy;
+    /**
+     * A list of labels to ignore
+     */
+    ignoreLabels?: string[];
+    /**
+     * The number of overlapping tokens between chunks when splitting the input text.
+     */
+    stride?: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -60,26 +62,26 @@ export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-	/**
-	 * The character position in the input where this group ends.
-	 */
-	end?: number;
-	/**
-	 * The predicted label for that group of tokens
-	 */
-	entityGroup?: string;
-	label: any;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	/**
-	 * The character position in the input where this group begins.
-	 */
-	start?: number;
-	/**
-	 * The corresponding text
-	 */
-	word?: string;
-	[property: string]: any;
+    /**
+     * The character position in the input where this group ends.
+     */
+    end?: number;
+    /**
+     * The predicted label for that group of tokens
+     */
+    entityGroup?: string;
+    label:        unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    /**
+     * The character position in the input where this group begins.
+     */
+    start?: number;
+    /**
+     * The corresponding text
+     */
+    word?: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index c4f31b0ea..3b059542a 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 7b2de7049..d98b4300e 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-	/**
-	 * One or several videos to be classified
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VideoClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several videos to be classified
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VideoClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,32 +27,32 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-	/**
-	 * The sampling rate used to select frames from the video.
-	 */
-	frameSamplingRate?: number;
-	/**
-	 * The number of sampled frames to consider for classification.
-	 */
-	numFrames?: number;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The sampling rate used to select frames from the video.
+     */
+    frameSamplingRate?: number;
+    /**
+     * The number of sampled frames to consider for classification.
+     */
+    numFrames?: number;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 05a57db48..da28c988e 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-	/**
-	 * One or more image-question pairs
-	 */
-	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VisualQuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * One or more image-question pairs
+     */
+    inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VisualQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface VisualQuestionAnsweringInputSingle {
-	/**
-	 * The image.
-	 */
-	image: any;
-	/**
-	 * The question to answer based on the image.
-	 */
-	question: any;
-	[property: string]: any;
+    /**
+     * The image.
+     */
+    image: unknown;
+    /**
+     * The question to answer based on the image.
+     */
+    question: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,27 +39,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-	/**
-	 * The answer to the question
-	 */
-	answer?: string;
-	label: any;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The answer to the question
+     */
+    answer?: string;
+    label:   unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index bc4497098..59b7cc3cd 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-	/**
-	 * One or several text + candidate labels pairs to classify
-	 */
-	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several text + candidate labels pairs to classify
+     */
+    inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotClassificationParameters;
+    [property: string]: unknown;
 }
 
 export interface ZeroShotClassificationInputSingle {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
-	[property: string]: any;
+    /**
+     * The set of possible class labels to classify the text into.
+     */
+    candidateLabels: string[];
+    /**
+     * The text to classify
+     */
+    text: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,31 +39,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	/**
-	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
-	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-	 * considered independent and probabilities are normalized for each candidate.
-	 */
-	multiLabel?: boolean;
-	[property: string]: any;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    /**
+     * Whether multiple candidate labels can be true. If false, the scores are normalized such
+     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+     * considered independent and probabilities are normalized for each candidate.
+     */
+    multiLabel?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 4ae4ff04e..38aafb6a1 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-	/**
-	 * One or several images to classify
-	 */
-	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotImageClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several images to classify
+     */
+    inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 export interface ZeroShotImageClassificationInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: any;
-	[property: string]: any;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to classify
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,25 +39,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	[property: string]: any;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 64162ae7c..e9ef360bf 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,54 +1,56 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-	/**
-	 * One or several images to perform object detection on
-	 */
-	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several images to perform object detection on
+     */
+    inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 export interface ZeroShotObjectDetectionInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: any;
-	[property: string]: any;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to generate bounding boxes from
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: Box;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: Box;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -56,9 +58,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface Box {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: any;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }

From d4ec5350b0d56f0153aca155a53617db6f23f8ff Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:00:11 +0100
Subject: [PATCH 22/51] =?UTF-8?q?=F0=9F=92=84format=20with=20pnpm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |   2 +-
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../document-question-answering/inference.ts  | 150 +++++++++---------
 .../src/tasks/feature-extraction/inference.ts |  22 ++-
 .../tasks/feature-extraction/spec/output.json |   2 +-
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++-----
 .../tasks/image-classification/inference.ts   |  50 +++---
 .../src/tasks/image-segmentation/inference.ts |  74 +++++----
 .../src/tasks/image-to-image/inference.ts     |  22 ++-
 .../src/tasks/image-to-text/inference.ts      |  42 +++--
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../src/tasks/question-answering/inference.ts | 146 +++++++++--------
 .../tasks/sentence-similarity/inference.ts    |  36 ++---
 .../src/tasks/summarization/inference.ts      |  58 ++++---
 .../table-question-answering/inference.ts     |  76 +++++----
 .../tasks/text-classification/inference.ts    |  58 ++++---
 .../src/tasks/text-generation/inference.ts    | 128 ++++++++-------
 .../src/tasks/text-to-audio/inference.ts      |  40 +++--
 .../src/tasks/text-to-speech/inference.ts     |  40 +++--
 .../tasks/text2text-generation/inference.ts   |  58 ++++---
 .../tasks/token-classification/inference.ts   |  92 ++++++-----
 .../tasks/src/tasks/translation/inference.ts  |  58 ++++---
 .../tasks/video-classification/inference.ts   |  66 ++++----
 .../visual-question-answering/inference.ts    |  74 +++++----
 .../zero-shot-classification/inference.ts     |  82 +++++-----
 .../inference.ts                              |  70 ++++----
 .../zero-shot-object-detection/inference.ts   |  78 +++++----
 29 files changed, 867 insertions(+), 921 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 7077133d0..e61a09163 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -45,4 +45,4 @@
 		"@types/node": "^20.11.5",
 		"quicktype-core": "https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz"
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index d65ead71b..2ac2e5065 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * On or several audio files to classify
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * On or several audio files to classify
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index abaa0caef..6eb20d0c1 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,33 +1,31 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-    /**
-     * The input audio data
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-    /**
-     * The recognized text.
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index ba5975a74..48b9d3438 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-    /**
-     * The input image data
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DepthEstimationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,9 +27,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 6c730e277..8dec0976f 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-    /**
-     * The
-     */
-    inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DocumentQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * The
+	 */
+	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface DocumentQuestionAnsweringInpu {
-    /**
-     * The image on which the question is asked
-     */
-    image?: unknown;
-    /**
-     * A question to ask of the document
-     */
-    question?: string;
-    [property: string]: unknown;
+	/**
+	 * The image on which the question is asked
+	 */
+	image?: unknown;
+	/**
+	 * A question to ask of the document
+	 */
+	question?: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,65 +37,65 @@ export interface DocumentQuestionAnsweringInpu {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-    /**
-     * If the words in the document are too long to fit with the question for the model, it will
-     * be split in several chunks with some overlap. This argument controls the size of that
-     * overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * Language to use while running OCR. Defaults to english.
-     */
-    lang?: string;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using doc_stride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Can return less
-     * than top_k answers if there are not enough options available within the context.
-     */
-    topK?: number;
-    /**
-     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-     * skip the OCR step and use the provided bounding boxes instead.
-     */
-    wordBoxes?: Array<number[] | string>;
-    [property: string]: unknown;
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    end:    number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    start: number;
-    /**
-     * The index of each word/box pair that is in the answer
-     */
-    words: number[];
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index c6b6dcec5..5c237d6dd 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,22 +1,20 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Feature Extraction inference
  */
 export interface FeatureExtractionInput {
-    /**
-     * One or several texts to get the features of
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to get the features of
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 47303e945..54a29d10e 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -48,4 +48,4 @@
 			}
 		]
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 380ae8cd2..097718900 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-    /**
-     * One or several texts with masked tokens
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: FillMaskParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts with masked tokens
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,39 +25,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-    /**
-     * When passed, the model will limit the scores to the passed targets instead of looking up
-     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-     * tokenized and the first resulting token will be used (with a warning, and that might be
-     * slower).
-     */
-    targets?: string[] | string;
-    /**
-     * When passed, overrides the number of predictions to return.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[] | string;
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-    /**
-     * The corresponding probability
-     */
-    score: number;
-    /**
-     * The corresponding input with the mask token prediction.
-     */
-    sequence: string;
-    /**
-     * The predicted token id (to replace the masked one).
-     */
-    token: number;
-    /**
-     * The predicted token (to replace the masked one).
-     */
-    tokenStr: string;
-    [property: string]: unknown;
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index b6700e06e..dfff0cfd9 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-    /**
-     * On or several image files to classify
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * On or several image files to classify
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 63fae5288..13c15cb72 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-    /**
-     * One or several image files to perform segmentation on
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageSegmentationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several image files to perform segmentation on
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-    /**
-     * Threshold to use when turning the predicted masks into binary values.
-     */
-    maskThreshold?: number;
-    /**
-     * Mask overlap threshold to eliminate small, disconnected segments.
-     */
-    overlapMaskAreaThreshold?: number;
-    /**
-     * Segmentation task to be performed, depending on model capabilities.
-     */
-    subtask?: Subtask;
-    /**
-     * Probability threshold to filter out predicted masks.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: Subtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 export type Subtask = "instance" | "panoptic" | "semantic";
@@ -54,13 +52,13 @@ export type Subtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-    /**
-     * The label of the predicted segment
-     */
-    label: string;
-    /**
-     * The corresponding mask as a black-and-white image
-     */
-    mask: unknown;
-    [property: string]: unknown;
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6fb0f997e..c1f1a5cb8 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-    /**
-     * One or more images to generate images from
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: unknown;
-    [property: string]: unknown;
+	/**
+	 * One or more images to generate images from
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 12e0d4968..029db76da 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-    /**
-     * One or several images to generated text for
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageToTextParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several images to generated text for
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,20 +25,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-    /**
-     * The amount of maximum tokens to generate.
-     */
-    maxNewTokens?: number;
-    [property: string]: unknown;
+	/**
+	 * The amount of maximum tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index d294110f7..228063fc0 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-    /**
-     * One or several input images to perform object detection on
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ObjectDetectionParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several input images to perform object detection on
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,31 +25,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-    /**
-     * The probability necessary to make a prediction.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: Box;
-    /**
-     * The predicted label for the bounding box
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: Box;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -59,9 +57,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface Box {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 75293984d..58da43f6d 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-    /**
-     * One or several question+context pairs to answer
-     */
-    inputs: SquadExample[] | SquadExample;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: QuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several question+context pairs to answer
+	 */
+	inputs: SquadExample[] | SquadExample;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface SquadExample {
-    /**
-     * The context to be used for answering the question
-     */
-    context: string;
-    /**
-     * The question to be answered
-     */
-    question: string;
-    [property: string]: unknown;
+	/**
+	 * The context to be used for answering the question
+	 */
+	context: string;
+	/**
+	 * The question to be answered
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,63 +37,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-    /**
-     * Attempts to align the answer to real words. Improves quality on space separated
-     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-     */
-    alignToWords?: boolean;
-    /**
-     * If the context is too long to fit with the question for the model, it will be split in
-     * several chunks with some overlap. This argument controls the size of that overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer.
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using docStride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    /**
-     * The character position in the input where the answer ends.
-     */
-    end: number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    /**
-     * The character position in the input where the answer begins.
-     */
-    start: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 40976f099..252326caf 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,34 +1,32 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-    inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 export interface SentenceSimilarityInputSingle {
-    /**
-     * A list of strings which will be compared against the source_sentence.
-     */
-    sentences: string[];
-    /**
-     * The string that you wish to compare the other strings with. This can be a phrase,
-     * sentence, or longer passage, depending on the model being used.
-     */
-    sourceSentence: string;
-    [property: string]: unknown;
+	/**
+	 * A list of strings which will be compared against the source_sentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 6169e942b..4a2fd40a5 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 22e6b8832..35b172a5d 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,61 +1,59 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-    /**
-     * One or several questions about a table
-     */
-    inputs: Inputs;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several questions about a table
+	 */
+	inputs: Inputs;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * One or several questions about a table
  */
 export interface Inputs {
-    /**
-     * One or several questions to be answered about the table
-     */
-    question?: string[] | string;
-    /**
-     * The table to serve as context for the questions
-     */
-    table?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several questions to be answered about the table
+	 */
+	question?: string[] | string;
+	/**
+	 * The table to serve as context for the questions
+	 */
+	table?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-    /**
-     * If the model has an aggregator, this returns the aggregator.
-     */
-    aggregator?: string;
-    /**
-     * The answer of the question given the table. If there is an aggregator, the answer will be
-     * preceded by `AGGREGATOR >`.
-     */
-    answer: string;
-    /**
-     * List of strings made up of the answer cell values.
-     */
-    cells: string[];
-    /**
-     * Coordinates of the cells of the answers.
-     */
-    coordinates: Array<number[]>;
-    [property: string]: unknown;
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer of the question given the table. If there is an aggregator, the answer will be
+	 * preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers.
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index c490ee94d..33648fdd8 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-    /**
-     * One or several texts to classify
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts to classify
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: FunctionToApply;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: FunctionToApply;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type FunctionToApply = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type FunctionToApply = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index b28b1f225..62af0a9c5 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to initialize generation with
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: unknown;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stop_sequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `top_p` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index f263ba834..3b23948a3 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,37 +1,35 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-    /**
-     * One or several texts to generate audio for
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index ca08be005..766b23a38 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-    /**
-     * One or several texts to generate audio for
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
@@ -29,13 +27,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 38aaf3ab2..5241648d3 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -48,9 +46,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 7b82c12c6..b6ec0e9b2 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-    /**
-     * One or several texts which tokens are to be classified
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TokenClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts which tokens are to be classified
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-    /**
-     * The strategy used to fuse tokens based on model predictions
-     */
-    aggregationStrategy?: AggregationStrategy;
-    /**
-     * A list of labels to ignore
-     */
-    ignoreLabels?: string[];
-    /**
-     * The number of overlapping tokens between chunks when splitting the input text.
-     */
-    stride?: number;
-    [property: string]: unknown;
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: AggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -62,26 +60,26 @@ export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-    /**
-     * The character position in the input where this group ends.
-     */
-    end?: number;
-    /**
-     * The predicted label for that group of tokens
-     */
-    entityGroup?: string;
-    label:        unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    /**
-     * The character position in the input where this group begins.
-     */
-    start?: number;
-    /**
-     * The corresponding text
-     */
-    word?: string;
-    [property: string]: unknown;
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The corresponding text
+	 */
+	word?: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 3b059542a..26786d8e5 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index d98b4300e..29b3fed09 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-    /**
-     * One or several videos to be classified
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VideoClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several videos to be classified
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,32 +25,32 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-    /**
-     * The sampling rate used to select frames from the video.
-     */
-    frameSamplingRate?: number;
-    /**
-     * The number of sampled frames to consider for classification.
-     */
-    numFrames?: number;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index da28c988e..3b9070447 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-    /**
-     * One or more image-question pairs
-     */
-    inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VisualQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more image-question pairs
+	 */
+	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface VisualQuestionAnsweringInputSingle {
-    /**
-     * The image.
-     */
-    image: unknown;
-    /**
-     * The question to answer based on the image.
-     */
-    question: unknown;
-    [property: string]: unknown;
+	/**
+	 * The image.
+	 */
+	image: unknown;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,27 +37,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-    /**
-     * The answer to the question
-     */
-    answer?: string;
-    label:   unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 59b7cc3cd..564c6ba62 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-    /**
-     * One or several text + candidate labels pairs to classify
-     */
-    inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several text + candidate labels pairs to classify
+	 */
+	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: unknown;
 }
 
 export interface ZeroShotClassificationInputSingle {
-    /**
-     * The set of possible class labels to classify the text into.
-     */
-    candidateLabels: string[];
-    /**
-     * The text to classify
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,31 +37,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    /**
-     * Whether multiple candidate labels can be true. If false, the scores are normalized such
-     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-     * considered independent and probabilities are normalized for each candidate.
-     */
-    multiLabel?: boolean;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 38aafb6a1..0976094a4 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-    /**
-     * One or several images to classify
-     */
-    inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several images to classify
+	 */
+	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 export interface ZeroShotImageClassificationInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to classify
-     */
-    image: unknown;
-    [property: string]: unknown;
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,25 +37,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index e9ef360bf..de136d6c3 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,56 +1,54 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-    /**
-     * One or several images to perform object detection on
-     */
-    inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several images to perform object detection on
+	 */
+	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 export interface ZeroShotObjectDetectionInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to generate bounding boxes from
-     */
-    image: unknown;
-    [property: string]: unknown;
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: Box;
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: Box;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -58,9 +56,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface Box {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }

From 00501a603e32131b3cb4fbec20325f2c656ea1b6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:09:08 +0100
Subject: [PATCH 23/51] Add canonicalId to TaskData

---
 packages/tasks/src/tasks/index.ts               | 1 +
 packages/tasks/src/tasks/summarization/data.ts  | 1 +
 packages/tasks/src/tasks/text-to-speech/data.ts | 1 +
 packages/tasks/src/tasks/translation/data.ts    | 1 +
 4 files changed, 4 insertions(+)

diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts
index b0615dfeb..9e9425376 100644
--- a/packages/tasks/src/tasks/index.ts
+++ b/packages/tasks/src/tasks/index.ts
@@ -216,6 +216,7 @@ export interface TaskData {
 	datasets: ExampleRepo[];
 	demo: TaskDemo;
 	id: PipelineType;
+	canonicalId?: PipelineType;
 	isPlaceholder?: boolean;
 	label: string;
 	libraries: ModelLibraryKey[];
diff --git a/packages/tasks/src/tasks/summarization/data.ts b/packages/tasks/src/tasks/summarization/data.ts
index b13fa3d16..bd04453da 100644
--- a/packages/tasks/src/tasks/summarization/data.ts
+++ b/packages/tasks/src/tasks/summarization/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text2text-generation",
 	datasets: [
 		{
 			description:
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
index 73560b7af..26c6f4837 100644
--- a/packages/tasks/src/tasks/text-to-speech/data.ts
+++ b/packages/tasks/src/tasks/text-to-speech/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text-to-audio",
 	datasets: [
 		{
 			description: "Thousands of short audio clips of a single speaker.",
diff --git a/packages/tasks/src/tasks/translation/data.ts b/packages/tasks/src/tasks/translation/data.ts
index c0e4c3a34..0edfab7b8 100644
--- a/packages/tasks/src/tasks/translation/data.ts
+++ b/packages/tasks/src/tasks/translation/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text2text-generation",
 	datasets: [
 		{
 			description: "A dataset of copyright-free books translated into 16 different languages.",

From 29fecc059b00ef9ceb5bb50c6f638a8b3373515b Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:13:32 +0100
Subject: [PATCH 24/51] Fix naming for bounding boxes types

---
 packages/tasks/src/tasks/object-detection/inference.ts        | 4 ++--
 packages/tasks/src/tasks/object-detection/spec/output.json    | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/inference.ts   | 4 ++--
 .../src/tasks/zero-shot-object-detection/spec/output.json     | 1 +
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 228063fc0..5675eb53a 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -40,7 +40,7 @@ export interface ObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: Box;
+	box: BoundingBox;
 	/**
 	 * The predicted label for the bounding box
 	 */
@@ -56,7 +56,7 @@ export interface ObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface Box {
+export interface BoundingBox {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index 41d0ed887..450d96ed2 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -24,6 +24,7 @@
 	"$defs": {
 		"BoundingBox": {
 			"type": "object",
+			"title": "BoundingBox",
 			"properties": {
 				"xmin": {
 					"type": "integer"
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index de136d6c3..6493541d8 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -39,7 +39,7 @@ export interface ZeroShotObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: Box;
+	box: BoundingBox;
 	/**
 	 * A candidate label
 	 */
@@ -55,7 +55,7 @@ export interface ZeroShotObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface Box {
+export interface BoundingBox {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 0e725af9e..171e81120 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -23,6 +23,7 @@
 	},
 	"$defs": {
 		"BoundingBox": {
+			"title": "BoundingBox",
 			"type": "object",
 			"properties": {
 				"xmin": {

From d220a9b75938d459a2d385b1fb33f11620a1b4eb Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:19:13 +0100
Subject: [PATCH 25/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Better=20names=20for?=
 =?UTF-8?q?=20intermediate=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/tasks/image-segmentation/inference.ts      | 4 ++--
 packages/tasks/src/tasks/image-segmentation/spec/input.json   | 1 +
 packages/tasks/src/tasks/summarization/inference.ts           | 4 ++--
 packages/tasks/src/tasks/text-classification/inference.ts     | 4 ++--
 packages/tasks/src/tasks/text-classification/spec/input.json  | 1 +
 packages/tasks/src/tasks/text2text-generation/inference.ts    | 4 ++--
 packages/tasks/src/tasks/text2text-generation/spec/input.json | 1 +
 packages/tasks/src/tasks/token-classification/inference.ts    | 4 ++--
 packages/tasks/src/tasks/token-classification/spec/input.json | 1 +
 packages/tasks/src/tasks/translation/inference.ts             | 4 ++--
 10 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 13c15cb72..8b5e6da56 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -36,7 +36,7 @@ export interface ImageSegmentationParameters {
 	/**
 	 * Segmentation task to be performed, depending on model capabilities.
 	 */
-	subtask?: Subtask;
+	subtask?: ImageSegmentationSubtask;
 	/**
 	 * Probability threshold to filter out predicted masks.
 	 */
@@ -44,7 +44,7 @@ export interface ImageSegmentationParameters {
 	[property: string]: unknown;
 }
 
-export type Subtask = "instance" | "panoptic" | "semantic";
+export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
 
 /**
  * Outputs of inference for the Image Segmentation task
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 5e050b8c7..06a80028b 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -27,6 +27,7 @@
 					"description": "Mask overlap threshold to eliminate small, disconnected segments."
 				},
 				"subtask": {
+					"title": "ImageSegmentationSubtask",
 					"type": "string",
 					"description": "Segmentation task to be performed, depending on model capabilities.",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 4a2fd40a5..d38632cd4 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -38,11 +38,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs for Summarization inference
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 33648fdd8..58a54af0c 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -28,7 +28,7 @@ export interface TextClassificationParameters {
 	/**
 	 * The function to apply to the model outputs in order to retrieve the scores.
 	 */
-	functionToApply?: FunctionToApply;
+	functionToApply?: TextClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
@@ -36,7 +36,7 @@ export interface TextClassificationParameters {
 	[property: string]: unknown;
 }
 
-export type FunctionToApply = "sigmoid" | "softmax" | "none";
+export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 
 /**
  * Outputs of inference for the Text Classification task
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index af40fea2e..73b14c794 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -30,6 +30,7 @@
 			"type": "object",
 			"properties": {
 				"functionToApply": {
+					"title": "TextClassificationOutputTransform",
 					"type": "string",
 					"description": "The function to apply to the model outputs in order to retrieve the scores.",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 5241648d3..7d2c7182e 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -36,11 +36,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs of inference for the Text2text Generation task
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index bec8fedfc..e8a0b9cd0 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -34,6 +34,7 @@
 					"description": "Whether to clean up the potential extra spaces in the text output."
 				},
 				"truncation": {
+					"title": "Text2textGenerationTruncationStrategy",
 					"type": "string",
 					"description": "The truncation strategy to use",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index b6ec0e9b2..fa18ba34b 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -28,7 +28,7 @@ export interface TokenClassificationParameters {
 	/**
 	 * The strategy used to fuse tokens based on model predictions
 	 */
-	aggregationStrategy?: AggregationStrategy;
+	aggregationStrategy?: TokenClassificationAggregationStrategy;
 	/**
 	 * A list of labels to ignore
 	 */
@@ -54,7 +54,7 @@ export interface TokenClassificationParameters {
  * Similar to "simple", also preserves word integrity (uses the label with the highest score
  * across the word's tokens).
  */
-export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max";
+export type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
 
 /**
  * Outputs of inference for the Token Classification task
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index 8ca4b07d3..a2fcf5fdf 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -41,6 +41,7 @@
 					"description": "The number of overlapping tokens between chunks when splitting the input text."
 				},
 				"aggregationStrategy": {
+					"title": "TokenClassificationAggregationStrategy",
 					"type": "string",
 					"description": "The strategy used to fuse tokens based on model predictions",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 26786d8e5..ecb108287 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -38,11 +38,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs for Translation inference

From 49a1d5027c24f7c022bdf3d9cdf4d6a55301809b Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:26:12 +0100
Subject: [PATCH 26/51] =?UTF-8?q?=E2=9C=A8=20Update=20placeholder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/tasks/placeholder/data.ts  |  3 ++
 .../src/tasks/placeholder/spec/input.json     | 44 +++++++++++++++++++
 .../src/tasks/placeholder/spec/output.json    | 16 +++++++
 3 files changed, 63 insertions(+)
 create mode 100644 packages/tasks/src/tasks/placeholder/spec/input.json
 create mode 100644 packages/tasks/src/tasks/placeholder/spec/output.json

diff --git a/packages/tasks/src/tasks/placeholder/data.ts b/packages/tasks/src/tasks/placeholder/data.ts
index 0cbc735ad..110b43703 100644
--- a/packages/tasks/src/tasks/placeholder/data.ts
+++ b/packages/tasks/src/tasks/placeholder/data.ts
@@ -13,6 +13,9 @@ const taskData: TaskDataCustom = {
 	summary: "",
 	widgetModels: [],
 	youtubeId: undefined,
+	/// If this is a subtask, link to the most general task ID
+	/// (e.g., text2text-generation is the canonical ID of translation)
+	canonicalId: undefined,
 };
 
 export default taskData;
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
new file mode 100644
index 000000000..ad61eb7ae
--- /dev/null
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -0,0 +1,44 @@
+{
+	"$id": "/inference/schemas/<TASK_ID>/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for <TASK_ID> inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "TODO: describe the input here. This must be model & framework agnostic.",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/<TASK_ID>Parameters"
+		}
+	},
+	"$defs": {
+		"<TASK_ID>Parameters": {
+			"title": "<TASK_ID>Parameters",
+			"description": "TODO: describe additional parameters here.",
+			"type": "object",
+			"properties": {
+				"dummyParameterName": {
+					"type": "boolean",
+					"description": "TODO: describe the parameter here"
+				},
+				"dummyParameterName2": {
+					"type": "integer",
+					"description": "TODO: describe the parameter here"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/placeholder/spec/output.json b/packages/tasks/src/tasks/placeholder/spec/output.json
new file mode 100644
index 000000000..b4b4225f6
--- /dev/null
+++ b/packages/tasks/src/tasks/placeholder/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"$id": "/inference/schemas/<TASK_ID>/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for <TASK_ID> inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"meaningfulOutputName": {
+				"type": "string",
+				"description": "TODO: Describe what is output by the inference here"
+			}
+		},
+		"required": ["meaningfulOutputName"]
+	}
+}

From f4784bf7436e963f3a94cab1b501ea76c191123a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 19:48:09 +0100
Subject: [PATCH 27/51] Changes from code review

---
 .../src/tasks/audio-classification/inference.ts  |  2 +-
 .../tasks/audio-classification/spec/input.json   |  2 +-
 .../document-question-answering/inference.ts     |  2 +-
 .../document-question-answering/spec/input.json  |  2 +-
 .../src/tasks/image-classification/inference.ts  |  6 ++++++
 .../tasks/image-classification/spec/input.json   | 16 ++++++++++++++++
 .../src/tasks/image-segmentation/inference.ts    |  4 ++++
 .../tasks/image-segmentation/spec/output.json    |  4 ++++
 8 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 2ac2e5065..9a108b61d 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -9,7 +9,7 @@
  */
 export interface AudioClassificationInput {
 	/**
-	 * On or several audio files to classify
+	 * One or several audio files to classify
 	 */
 	inputs: unknown;
 	/**
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 29357710d..685e92a0f 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -5,7 +5,7 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "On or several audio files to classify"
+			"description": "One or several audio files to classify"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 8dec0976f..d9f01c50f 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -9,7 +9,7 @@
  */
 export interface DocumentQuestionAnsweringInput {
 	/**
-	 * The
+	 * One or several document+question pairs to answer
 	 */
 	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
 	/**
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 86d0708c5..394182f43 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -5,7 +5,7 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The ",
+			"description": "One or several document+question pairs to answer",
 			"anyOf": [
 				{
 					"$ref": "#/$defs/DocumentAndQuestion"
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index dfff0cfd9..92018d69f 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -25,6 +25,10 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: ImageClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
@@ -32,6 +36,8 @@ export interface ImageClassificationParameters {
 	[property: string]: unknown;
 }
 
+export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs of inference for the Image Classification task
  */
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 875fae0e0..ecd23443d 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Image Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "ImageClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"topK": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 8b5e6da56..5cd1af00f 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -60,5 +60,9 @@ export interface ImageSegmentationOutput {
 	 * The corresponding mask as a black-and-white image
 	 */
 	mask: unknown;
+	/**
+	 * The score or confidence degreee the model has
+	 */
+	score?: number;
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 80db732e3..4b7cb643c 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -13,6 +13,10 @@
 			},
 			"mask": {
 				"description": "The corresponding mask as a black-and-white image"
+			},
+			"score": {
+				"type": "number",
+				"description": "The score or confidence degreee the model has"
 			}
 		},
 		"required": ["label", "mask"]

From a33987fe219616a75c0e9fca79a65164472a9dd0 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 19:50:42 +0100
Subject: [PATCH 28/51] mark image & question as required in doc QA

---
 .../tasks/src/tasks/document-question-answering/inference.ts  | 4 ++--
 .../src/tasks/document-question-answering/spec/input.json     | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index d9f01c50f..092268360 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -23,11 +23,11 @@ export interface DocumentQuestionAnsweringInpu {
 	/**
 	 * The image on which the question is asked
 	 */
-	image?: unknown;
+	image: unknown;
 	/**
 	 * A question to ask of the document
 	 */
-	question?: string;
+	question: string;
 	[property: string]: unknown;
 }
 
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 394182f43..753790c93 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -88,7 +88,8 @@
 					"type": "string",
 					"description": "A question to ask of the document"
 				}
-			}
+			},
+			"required": ["image", "question"]
 		}
 	},
 	"required": ["inputs"]

From 6558af4db61b144ab2d521732b473abb31011b0a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Wed, 24 Jan 2024 19:26:53 +0100
Subject: [PATCH 29/51] Document QA: rename input element to inputsingle

---
 .../src/tasks/document-question-answering/inference.ts     | 4 ++--
 .../src/tasks/document-question-answering/spec/input.json  | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 092268360..f8963c0bd 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface DocumentQuestionAnsweringInput {
 	/**
 	 * One or several document+question pairs to answer
 	 */
-	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+	inputs: DocumentQuestionAnsweringInputSingle[] | DocumentQuestionAnsweringInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface DocumentQuestionAnsweringInput {
 	[property: string]: unknown;
 }
 
-export interface DocumentQuestionAnsweringInpu {
+export interface DocumentQuestionAnsweringInputSingle {
 	/**
 	 * The image on which the question is asked
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 753790c93..84d286e23 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -8,12 +8,12 @@
 			"description": "One or several document+question pairs to answer",
 			"anyOf": [
 				{
-					"$ref": "#/$defs/DocumentAndQuestion"
+					"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/$defs/DocumentAndQuestion"
+						"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 					}
 				}
 			]
@@ -78,8 +78,9 @@
 				}
 			}
 		},
-		"DocumentAndQuestion": {
+		"DocumentQuestionAnsweringInputSingle": {
 			"type": "object",
+			"title": "DocumentQuestionAnsweringInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image on which the question is asked"

From 0724e261fe94dea2676240d7fcef55a955902dbc Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:24:42 +0100
Subject: [PATCH 30/51] No batching

---
 .vscode/settings.json                         |   5 +
 .../tasks/audio-classification/inference.ts   |  56 ++++---
 .../audio-classification/spec/input.json      |  22 ++-
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../spec/input.json                           |   4 +-
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../tasks/depth-estimation/spec/input.json    |   4 +-
 .../document-question-answering/inference.ts  | 153 +++++++++---------
 .../spec/input.json                           |  18 +--
 .../src/tasks/feature-extraction/inference.ts |  24 +--
 .../tasks/feature-extraction/spec/input.json  |  22 +--
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 ++++-----
 .../tasks/src/tasks/fill-mask/spec/input.json |  35 ++--
 .../tasks/image-classification/inference.ts   |  58 +++----
 .../image-classification/spec/input.json      |   6 +-
 .../src/tasks/image-segmentation/inference.ts |  82 +++++-----
 .../tasks/image-segmentation/spec/input.json  |   6 +-
 .../src/tasks/image-to-image/inference.ts     |  22 +--
 .../src/tasks/image-to-image/spec/input.json  |   6 +-
 .../src/tasks/image-to-text/inference.ts      |  42 ++---
 .../src/tasks/image-to-text/spec/input.json   |   6 +-
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../tasks/object-detection/spec/input.json    |   6 +-
 .../src/tasks/placeholder/spec/input.json     |   4 +-
 .../src/tasks/question-answering/inference.ts | 151 ++++++++---------
 .../tasks/question-answering/spec/input.json  |  40 ++---
 .../tasks/sentence-similarity/inference.ts    |  38 ++---
 .../tasks/sentence-similarity/spec/input.json |  31 ++--
 .../src/tasks/summarization/inference.ts      |  58 +++----
 .../table-question-answering/inference.ts     |  80 ++++-----
 .../table-question-answering/spec/input.json  |  23 +--
 .../tasks/text-classification/inference.ts    |  58 +++----
 .../tasks/text-classification/spec/input.json |  18 +--
 .../src/tasks/text-generation/inference.ts    | 128 +++++++--------
 .../src/tasks/text-generation/spec/input.json |  16 +-
 .../src/tasks/text-to-audio/inference.ts      |  40 ++---
 .../src/tasks/text-to-audio/spec/input.json   |  18 +--
 .../src/tasks/text-to-speech/inference.ts     |  40 ++---
 .../tasks/text2text-generation/inference.ts   |  58 +++----
 .../text2text-generation/spec/input.json      |  20 +--
 .../tasks/token-classification/inference.ts   |  92 +++++------
 .../token-classification/spec/input.json      |  18 +--
 .../tasks/src/tasks/translation/inference.ts  |  58 +++----
 .../tasks/video-classification/inference.ts   |  72 +++++----
 .../video-classification/spec/input.json      |  20 ++-
 .../visual-question-answering/inference.ts    |  77 ++++-----
 .../visual-question-answering/spec/input.json |  31 ++--
 .../zero-shot-classification/inference.ts     |  87 +++++-----
 .../zero-shot-classification/spec/input.json  |  33 ++--
 .../inference.ts                              |  73 +++++----
 .../spec/input.json                           |  31 ++--
 .../zero-shot-object-detection/inference.ts   |  81 +++++-----
 .../spec/input.json                           |  31 ++--
 53 files changed, 1131 insertions(+), 1185 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5e40510b2..072ae9648 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,5 +6,10 @@
 	},
 	"[svelte]": {
 		"editor.defaultFormatter": "esbenp.prettier-vscode"
+	},
+	"prettier.configPath": ".prettierrc",
+	"json.format.enable": false,
+	"[json]": {
+		"editor.defaultFormatter": "esbenp.prettier-vscode"
 	}
 }
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 9a108b61d..1ba43fae1 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-	/**
-	 * One or several audio files to classify
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: AudioClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input audio data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,30 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: AudioClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
+export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 685e92a0f..60062756c 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several audio files to classify"
+		"input": {
+			"description": "The input audio data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "AudioClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"topK": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
@@ -25,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 6eb20d0c1..7ddfe0055 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,31 +1,33 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-	/**
-	 * The input audio data
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input audio data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The recognized text.
-	 */
-	text: string;
-	[property: string]: unknown;
+    /**
+     * The recognized text.
+     */
+    text: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index a4034b5e1..147851e57 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -20,5 +20,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 48b9d3438..9019043eb 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-	/**
-	 * The input image data
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DepthEstimationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DepthEstimationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,9 +29,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index f33df6444..80f491974 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index f8963c0bd..a04b3b39b 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-	/**
-	 * One or several document+question pairs to answer
-	 */
-	inputs: DocumentQuestionAnsweringInputSingle[] | DocumentQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DocumentQuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (document, question) pair to answer
+     */
+    input: DocumentQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DocumentQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * One (document, question) pair to answer
+ */
 export interface DocumentQuestionAnsweringInputSingle {
-	/**
-	 * The image on which the question is asked
-	 */
-	image: unknown;
-	/**
-	 * A question to ask of the document
-	 */
-	question: string;
-	[property: string]: unknown;
+    /**
+     * The image on which the question is asked
+     */
+    image: unknown;
+    /**
+     * A question to ask of the document
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,65 +42,65 @@ export interface DocumentQuestionAnsweringInputSingle {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-	/**
-	 * If the words in the document are too long to fit with the question for the model, it will
-	 * be split in several chunks with some overlap. This argument controls the size of that
-	 * overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * Language to use while running OCR. Defaults to english.
-	 */
-	lang?: string;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using doc_stride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Can return less
-	 * than top_k answers if there are not enough options available within the context.
-	 */
-	topK?: number;
-	/**
-	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-	 * skip the OCR step and use the provided bounding boxes instead.
-	 */
-	wordBoxes?: Array<number[] | string>;
-	[property: string]: unknown;
+    /**
+     * If the words in the document are too long to fit with the question for the model, it will
+     * be split in several chunks with some overlap. This argument controls the size of that
+     * overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * Language to use while running OCR. Defaults to english.
+     */
+    lang?: string;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using doc_stride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Can return less
+     * than top_k answers if there are not enough options available within the context.
+     */
+    topK?: number;
+    /**
+     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+     * skip the OCR step and use the provided bounding boxes instead.
+     */
+    wordBoxes?: Array<number[] | string>;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
-	[property: string]: unknown;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    end:    number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    start: number;
+    /**
+     * The index of each word/box pair that is in the answer
+     */
+    words: number[];
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 84d286e23..580a95c92 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several document+question pairs to answer",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
-					}
-				}
-			]
+		"input": {
+			"description": "One (document, question) pair to answer",
+			"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -93,5 +83,5 @@
 			"required": ["image", "question"]
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 5c237d6dd..d8674f516 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,20 +1,22 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
- * Inputs for Feature Extraction inference
+ * Inputs for Text Embedding inference
  */
 export interface FeatureExtractionInput {
-	/**
-	 * One or several texts to get the features of
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The text to get the embeddings of
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 8bf05339a..e2eadc4e9 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,22 +1,12 @@
 {
-	"$id": "/inference/schemas/feature-extraction/input.json",
+	"$id": "/inference/schemas/text-embedding/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Inputs for Feature Extraction inference",
+	"description": "Inputs for Text Embedding inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to get the features of",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text to get the embeddings of",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -31,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 097718900..e01feec2f 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-	/**
-	 * One or several texts with masked tokens
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: FillMaskParameters;
-	[property: string]: unknown;
+    /**
+     * The text with masked tokens
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: FillMaskParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,39 +27,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-	/**
-	 * When passed, the model will limit the scores to the passed targets instead of looking up
-	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-	 * tokenized and the first resulting token will be used (with a warning, and that might be
-	 * slower).
-	 */
-	targets?: string[] | string;
-	/**
-	 * When passed, overrides the number of predictions to return.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * When passed, the model will limit the scores to the passed targets instead of looking up
+     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+     * tokenized and the first resulting token will be used (with a warning, and that might be
+     * slower).
+     */
+    targets?: string[];
+    /**
+     * When passed, overrides the number of predictions to return.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-	/**
-	 * The corresponding probability
-	 */
-	score: number;
-	/**
-	 * The corresponding input with the mask token prediction.
-	 */
-	sequence: string;
-	/**
-	 * The predicted token id (to replace the masked one).
-	 */
-	token: number;
-	/**
-	 * The predicted token (to replace the masked one).
-	 */
-	tokenStr: string;
-	[property: string]: unknown;
+    /**
+     * The corresponding probability
+     */
+    score: number;
+    /**
+     * The corresponding input with the mask token prediction.
+     */
+    sequence: string;
+    /**
+     * The predicted token id (to replace the masked one).
+     */
+    token: number;
+    /**
+     * The predicted token (to replace the masked one).
+     */
+    tokenStr: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 6f7402efb..62f935fe2 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts with masked tokens",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text with masked tokens",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -34,21 +24,14 @@
 					"description": "When passed, overrides the number of predictions to return."
 				},
 				"targets": {
-					"anyOf": [
-						{
-							"type": "string"
-						},
-						{
-							"type": "array",
-							"items": {
-								"type": "string"
-							}
-						}
-					],
-					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower)."
+					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower).",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
 				}
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 92018d69f..488531255 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-	/**
-	 * On or several image files to classify
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: ImageClassificationOutputTransform;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: ImageClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index ecd23443d..fec2d9aa0 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "On or several image files to classify"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 5cd1af00f..ba31379ef 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-	/**
-	 * One or several image files to perform segmentation on
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageSegmentationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageSegmentationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,23 +27,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-	/**
-	 * Threshold to use when turning the predicted masks into binary values.
-	 */
-	maskThreshold?: number;
-	/**
-	 * Mask overlap threshold to eliminate small, disconnected segments.
-	 */
-	overlapMaskAreaThreshold?: number;
-	/**
-	 * Segmentation task to be performed, depending on model capabilities.
-	 */
-	subtask?: ImageSegmentationSubtask;
-	/**
-	 * Probability threshold to filter out predicted masks.
-	 */
-	threshold?: number;
-	[property: string]: unknown;
+    /**
+     * Threshold to use when turning the predicted masks into binary values.
+     */
+    maskThreshold?: number;
+    /**
+     * Mask overlap threshold to eliminate small, disconnected segments.
+     */
+    overlapMaskAreaThreshold?: number;
+    /**
+     * Segmentation task to be performed, depending on model capabilities.
+     */
+    subtask?: ImageSegmentationSubtask;
+    /**
+     * Probability threshold to filter out predicted masks.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
@@ -52,17 +54,17 @@ export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-	/**
-	 * The label of the predicted segment
-	 */
-	label: string;
-	/**
-	 * The corresponding mask as a black-and-white image
-	 */
-	mask: unknown;
-	/**
-	 * The score or confidence degreee the model has
-	 */
-	score?: number;
-	[property: string]: unknown;
+    /**
+     * The label of the predicted segment
+     */
+    label: string;
+    /**
+     * The corresponding mask as a black-and-white image
+     */
+    mask: unknown;
+    /**
+     * The score or confidence degreee the model has
+     */
+    score?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 06a80028b..5e2a115e3 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several image files to perform segmentation on"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index c1f1a5cb8..6ee8d47ba 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-	/**
-	 * One or more images to generate images from
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: unknown;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 38b1202ef..61653d3c3 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more images to generate images from"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -18,5 +18,5 @@
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 029db76da..d4430b5c5 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-	/**
-	 * One or several images to generated text for
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageToTextParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageToTextParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,20 +27,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-	/**
-	 * The amount of maximum tokens to generate.
-	 */
-	maxNewTokens?: number;
-	[property: string]: unknown;
+    /**
+     * The amount of maximum tokens to generate.
+     */
+    maxNewTokens?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 140f9e27e..0ae6331c5 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to generated text for"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 5675eb53a..0d38adb58 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-	/**
-	 * One or several input images to perform object detection on
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ObjectDetectionParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ObjectDetectionParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,31 +27,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-	/**
-	 * The probability necessary to make a prediction.
-	 */
-	threshold?: number;
-	[property: string]: unknown;
+    /**
+     * The probability necessary to make a prediction.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * The predicted label for the bounding box
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: BoundingBox;
+    /**
+     * The predicted label for the bounding box
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -57,9 +59,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index f8647e78a..5055f2e17 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several input images to perform object detection on"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index ad61eb7ae..8dc1c3261 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for <TASK_ID> inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "TODO: describe the input here. This must be model & framework agnostic.",
 			"anyOf": [
 				{
@@ -40,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 58da43f6d..7c3dd476a 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-	/**
-	 * One or several question+context pairs to answer
-	 */
-	inputs: SquadExample[] | SquadExample;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: QuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (context, question) pair to answer
+     */
+    input: Input;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: QuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
-export interface SquadExample {
-	/**
-	 * The context to be used for answering the question
-	 */
-	context: string;
-	/**
-	 * The question to be answered
-	 */
-	question: string;
-	[property: string]: unknown;
+/**
+ * One (context, question) pair to answer
+ */
+export interface Input {
+    /**
+     * The context to be used for answering the question
+     */
+    context: string;
+    /**
+     * The question to be answered
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,63 +42,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-	/**
-	 * Attempts to align the answer to real words. Improves quality on space separated
-	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-	 */
-	alignToWords?: boolean;
-	/**
-	 * If the context is too long to fit with the question for the model, it will be split in
-	 * several chunks with some overlap. This argument controls the size of that overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer.
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using docStride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * Attempts to align the answer to real words. Improves quality on space separated
+     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+     */
+    alignToWords?: boolean;
+    /**
+     * If the context is too long to fit with the question for the model, it will be split in
+     * several chunks with some overlap. This argument controls the size of that overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer.
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using docStride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	/**
-	 * The character position in the input where the answer ends.
-	 */
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	/**
-	 * The character position in the input where the answer begins.
-	 */
-	start: number;
-	[property: string]: unknown;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    /**
+     * The character position in the input where the answer ends.
+     */
+    end: number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    /**
+     * The character position in the input where the answer begins.
+     */
+    start: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 9eab32e13..da38a8f8a 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -4,41 +4,27 @@
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several question+context pairs to answer",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/SquadExample"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/SquadExample"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/QuestionAnsweringParameters"
-		}
-	},
-	"$defs": {
-		"SquadExample": {
-			"title": "SquadExample",
+		"input": {
+			"description": "One (context, question) pair to answer",
 			"type": "object",
 			"properties": {
-				"question": {
-					"type": "string",
-					"description": "The question to be answered"
-				},
 				"context": {
 					"type": "string",
 					"description": "The context to be used for answering the question"
+				},
+				"question": {
+					"type": "string",
+					"description": "The question to be answered"
 				}
 			},
 			"required": ["question", "context"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/QuestionAnsweringParameters"
+		}
+	},
+	"$defs": {
 		"QuestionAnsweringParameters": {
 			"title": "QuestionAnsweringParameters",
 			"description": "Additional inference parameters for Question Answering",
@@ -75,5 +61,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 252326caf..2b0df4011 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,32 +1,34 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    input: InputObject;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
-export interface SentenceSimilarityInputSingle {
-	/**
-	 * A list of strings which will be compared against the source_sentence.
-	 */
-	sentences: string[];
-	/**
-	 * The string that you wish to compare the other strings with. This can be a phrase,
-	 * sentence, or longer passage, depending on the model being used.
-	 */
-	sourceSentence: string;
-	[property: string]: unknown;
+export interface InputObject {
+    /**
+     * A list of strings which will be compared against the source_sentence.
+     */
+    sentences: string[];
+    /**
+     * The string that you wish to compare the other strings with. This can be a phrase,
+     * sentence, or longer passage, depending on the model being used.
+     */
+    sourceSentence: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index cfb884abe..2e85543f8 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Sentence similarity inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"anyOf": [
-				{
-					"$ref": "#/$defs/SentenceSimilarityInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/SentenceSimilarityInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/SentenceSimilarityParameters"
-		}
-	},
-	"$defs": {
-		"SentenceSimilarityInputSingle": {
-			"title": "SentenceSimilarityInputSingle",
+		"input": {
+			"title": "SentenceSimilarityInput",
 			"type": "object",
 			"properties": {
 				"sourceSentence": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["sourceSentence", "sentences"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/SentenceSimilarityParameters"
+		}
+	},
+	"$defs": {
 		"SentenceSimilarityParameters": {
 			"title": "SentenceSimilarityParameters",
 			"description": "Additional inference parameters for Sentence Similarity",
@@ -48,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index d38632cd4..9a063e93c 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 35b172a5d..21cc519b0 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,59 +1,61 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-	/**
-	 * One or several questions about a table
-	 */
-	inputs: Inputs;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * One (table, question) pair to answer
+     */
+    input: Input;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
- * One or several questions about a table
+ * One (table, question) pair to answer
  */
-export interface Inputs {
-	/**
-	 * One or several questions to be answered about the table
-	 */
-	question?: string[] | string;
-	/**
-	 * The table to serve as context for the questions
-	 */
-	table?: { [key: string]: unknown };
-	[property: string]: unknown;
+export interface Input {
+    /**
+     * The question to be answered about the table
+     */
+    question: string;
+    /**
+     * The table to serve as context for the questions
+     */
+    table: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-	/**
-	 * If the model has an aggregator, this returns the aggregator.
-	 */
-	aggregator?: string;
-	/**
-	 * The answer of the question given the table. If there is an aggregator, the answer will be
-	 * preceded by `AGGREGATOR >`.
-	 */
-	answer: string;
-	/**
-	 * List of strings made up of the answer cell values.
-	 */
-	cells: string[];
-	/**
-	 * Coordinates of the cells of the answers.
-	 */
-	coordinates: Array<number[]>;
-	[property: string]: unknown;
+    /**
+     * If the model has an aggregator, this returns the aggregator.
+     */
+    aggregator?: string;
+    /**
+     * The answer of the question given the table. If there is an aggregator, the answer will be
+     * preceded by `AGGREGATOR >`.
+     */
+    answer: string;
+    /**
+     * List of strings made up of the answer cell values.
+     */
+    cells: string[];
+    /**
+     * Coordinates of the cells of the answers.
+     */
+    coordinates: Array<number[]>;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index aa7c7231f..3ceb5c07a 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several questions about a table",
+		"input": {
+			"description": "One (table, question) pair to answer",
 			"type": "object",
 			"properties": {
 				"table": {
@@ -13,20 +13,11 @@
 					"type": "object"
 				},
 				"question": {
-					"description": "One or several questions to be answered about the table",
-					"anyOf": [
-						{
-							"type": "string"
-						},
-						{
-							"type": "array",
-							"items": {
-								"type": "string"
-							}
-						}
-					]
+					"description": "The question to be answered about the table",
+					"type": "string"
 				}
-			}
+			},
+			"required": ["table", "question"]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -41,5 +32,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 58a54af0c..6e09d5c7c 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-	/**
-	 * One or several texts to classify
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The text to classify
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: TextClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 73b14c794..85c8468ad 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to classify",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text to classify",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -52,5 +42,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 62af0a9c5..cd83abbb2 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-	/**
-	 * The text to initialize generation with
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The text to initialize generation with
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,63 +27,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-	/**
-	 * Whether to use logit sampling (true) or greedy search (false).
-	 */
-	doSample?: boolean;
-	/**
-	 * Maximum number of generated tokens.
-	 */
-	maxNewTokens?: number;
-	/**
-	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-	 * paper](https://hf.co/papers/1909.05858) for more details.
-	 */
-	repetitionPenalty?: number;
-	/**
-	 * Whether to prepend the prompt to the generated text.
-	 */
-	returnFullText?: boolean;
-	/**
-	 * Stop generating tokens if a member of `stop_sequences` is generated.
-	 */
-	stopSequences?: string[];
-	/**
-	 * The value used to modulate the logits distribution.
-	 */
-	temperature?: number;
-	/**
-	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-	 */
-	topK?: number;
-	/**
-	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-	 * up to `top_p` or higher are kept for generation.
-	 */
-	topP?: number;
-	/**
-	 * Truncate input tokens to the given size.
-	 */
-	truncate?: number;
-	/**
-	 * Typical Decoding mass. See [Typical Decoding for Natural Language
-	 * Generation](https://hf.co/papers/2202.00666) for more information
-	 */
-	typicalP?: number;
-	/**
-	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-	 */
-	watermark?: boolean;
-	[property: string]: unknown;
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-	/**
-	 * The generated text
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 9b5d3d08e..b1cf45995 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The text to initialize generation with",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -79,5 +69,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 3b23948a3..71e745c21 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,35 +1,37 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: unknown;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 96febb6fc..a0802c102 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to generate audio for",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -31,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 766b23a38..be5c01981 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
@@ -27,13 +29,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: unknown;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 7d2c7182e..ce6623422 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -46,9 +48,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index e8a0b9cd0..495b5a281 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text2text Generation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more texts to use for text2text generation",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -52,7 +42,7 @@
 						}
 					]
 				},
-				"Parameters": {
+				"generateParameters": {
 					"title": "generateParameters",
 					"type": "object",
 					"description": "Additional parametrization of the text generation algorithm"
@@ -60,5 +50,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index fa18ba34b..629c47c47 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-	/**
-	 * One or several texts which tokens are to be classified
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TokenClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TokenClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-	/**
-	 * The strategy used to fuse tokens based on model predictions
-	 */
-	aggregationStrategy?: TokenClassificationAggregationStrategy;
-	/**
-	 * A list of labels to ignore
-	 */
-	ignoreLabels?: string[];
-	/**
-	 * The number of overlapping tokens between chunks when splitting the input text.
-	 */
-	stride?: number;
-	[property: string]: unknown;
+    /**
+     * The strategy used to fuse tokens based on model predictions
+     */
+    aggregationStrategy?: TokenClassificationAggregationStrategy;
+    /**
+     * A list of labels to ignore
+     */
+    ignoreLabels?: string[];
+    /**
+     * The number of overlapping tokens between chunks when splitting the input text.
+     */
+    stride?: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -60,26 +62,26 @@ export type TokenClassificationAggregationStrategy = "none" | "simple" | "first"
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-	/**
-	 * The character position in the input where this group ends.
-	 */
-	end?: number;
-	/**
-	 * The predicted label for that group of tokens
-	 */
-	entityGroup?: string;
-	label: unknown;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	/**
-	 * The character position in the input where this group begins.
-	 */
-	start?: number;
-	/**
-	 * The corresponding text
-	 */
-	word?: string;
-	[property: string]: unknown;
+    /**
+     * The character position in the input where this group ends.
+     */
+    end?: number;
+    /**
+     * The predicted label for that group of tokens
+     */
+    entityGroup?: string;
+    label:        unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    /**
+     * The character position in the input where this group begins.
+     */
+    start?: number;
+    /**
+     * The corresponding text
+     */
+    word?: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index a2fcf5fdf..f46b20cf9 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts which tokens are to be classified",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -70,5 +60,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index ecb108287..96090808b 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 29b3fed09..2d258b33b 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-	/**
-	 * One or several videos to be classified
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VideoClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * One or several videos to be classified
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VideoClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,32 +27,38 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-	/**
-	 * The sampling rate used to select frames from the video.
-	 */
-	frameSamplingRate?: number;
-	/**
-	 * The number of sampled frames to consider for classification.
-	 */
-	numFrames?: number;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The sampling rate used to select frames from the video.
+     */
+    frameSamplingRate?: number;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: TextClassificationOutputTransform;
+    /**
+     * The number of sampled frames to consider for classification.
+     */
+    numFrames?: number;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
+export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 91b9f9642..796ce393f 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "One or several videos to be classified"
 		},
 		"parameters": {
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Video Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "TextClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"numFrames": {
 					"type": "integer",
 					"description": "The number of sampled frames to consider for classification."
@@ -33,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 3b9070447..7d192a33d 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-	/**
-	 * One or more image-question pairs
-	 */
-	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VisualQuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (image, question) pair to answer
+     */
+    input: VisualQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VisualQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * One (image, question) pair to answer
+ */
 export interface VisualQuestionAnsweringInputSingle {
-	/**
-	 * The image.
-	 */
-	image: unknown;
-	/**
-	 * The question to answer based on the image.
-	 */
-	question: unknown;
-	[property: string]: unknown;
+    /**
+     * The image.
+     */
+    image: unknown;
+    /**
+     * The question to answer based on the image.
+     */
+    question: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,27 +42,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-	/**
-	 * The answer to the question
-	 */
-	answer?: string;
-	label: unknown;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The answer to the question
+     */
+    answer?: string;
+    label:   unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index cc6e5d93a..2e77422d9 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more image-question pairs",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
-		}
-	},
-	"$defs": {
-		"VisualQuestionAnsweringInputSingle": {
+		"input": {
+			"description": "One (image, question) pair to answer",
 			"type": "object",
 			"title": "VisualQuestionAnsweringInputSingle",
 			"properties": {
@@ -37,6 +18,12 @@
 			},
 			"required": ["question", "image"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
+		}
+	},
+	"$defs": {
 		"VisualQuestionAnsweringParameters": {
 			"title": "VisualQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Visual Question Answering",
@@ -49,5 +36,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 564c6ba62..578f24946 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-	/**
-	 * One or several text + candidate labels pairs to classify
-	 */
-	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data, with candidate labels
+     */
+    input: InputObject;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotClassificationParameters;
+    [property: string]: unknown;
 }
 
-export interface ZeroShotClassificationInputSingle {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
-	[property: string]: unknown;
+/**
+ * The input text data, with candidate labels
+ */
+export interface InputObject {
+    /**
+     * The set of possible class labels to classify the text into.
+     */
+    candidateLabels: string[];
+    /**
+     * The text to classify
+     */
+    text: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,31 +42,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	/**
-	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
-	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-	 * considered independent and probabilities are normalized for each candidate.
-	 */
-	multiLabel?: boolean;
-	[property: string]: unknown;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    /**
+     * Whether multiple candidate labels can be true. If false, the scores are normalized such
+     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+     * considered independent and probabilities are normalized for each candidate.
+     */
+    multiLabel?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index e573f6817..ce10d0b61 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -4,29 +4,10 @@
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several text + candidate labels pairs to classify",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotClassificationInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotClassificationInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotClassificationParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotClassificationInputSingle": {
+		"input": {
+			"description": "The input text data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotClassificationInputSingle",
+			"title": "ZeroShotClassificationInput",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -42,6 +23,12 @@
 			},
 			"required": ["text", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotClassificationParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotClassificationParameters": {
 			"title": "ZeroShotClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Classification",
@@ -58,5 +45,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 0976094a4..970d7708b 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-	/**
-	 * One or several images to classify
-	 */
-	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotImageClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data, with candidate labels
+     */
+    input: ZeroShotImageClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotImageClassificationParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * The input image data, with candidate labels
+ */
 export interface ZeroShotImageClassificationInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: unknown;
-	[property: string]: unknown;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to classify
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,25 +42,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	[property: string]: unknown;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 029b19b2d..07fffa5ef 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to classify",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotImageClassificationInputSingle": {
+		"input": {
+			"description": "The input image data, with candidate labels",
 			"type": "object",
 			"title": "ZeroShotImageClassificationInputSingle",
 			"properties": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["image", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotImageClassificationParameters": {
 			"title": "ZeroShotImageClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Image Classification",
@@ -53,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 6493541d8..2e3e12f74 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,54 +1,59 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-	/**
-	 * One or several images to perform object detection on
-	 */
-	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input image data, with candidate labels
+     */
+    input: ZeroShotObjectDetectionInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
+/**
+ * The input image data, with candidate labels
+ */
 export interface ZeroShotObjectDetectionInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
-	[property: string]: unknown;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to generate bounding boxes from
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: BoundingBox;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -56,9 +61,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index f2929226b..7f72f3f9b 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to perform object detection on",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotObjectDetectionInputSingle": {
+		"input": {
+			"description": "The input image data, with candidate labels",
 			"type": "object",
 			"title": "ZeroShotObjectDetectionInputSingle",
 			"properties": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["image", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotObjectDetectionParameters": {
 			"title": "ZeroShotObjectDetectionParameters",
 			"description": "Additional inference parameters for Zero Shot Object Detection",
@@ -48,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }

From 29f5975a715f519c50c42473f1c2fbeea212f9da Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:35:25 +0100
Subject: [PATCH 31/51] rename input -> data

---
 .../tasks/audio-classification/inference.ts   |  58 ++++---
 .../audio-classification/spec/input.json      |   4 +-
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../spec/input.json                           |   4 +-
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../tasks/depth-estimation/spec/input.json    |   4 +-
 .../document-question-answering/inference.ts  | 152 +++++++++---------
 .../spec/input.json                           |  31 ++--
 .../src/tasks/feature-extraction/inference.ts |  22 ++-
 .../tasks/feature-extraction/spec/input.json  |   4 +-
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++----
 .../tasks/src/tasks/fill-mask/spec/input.json |   4 +-
 .../tasks/image-classification/inference.ts   |  58 ++++---
 .../image-classification/spec/input.json      |   4 +-
 .../src/tasks/image-segmentation/inference.ts |  82 +++++-----
 .../tasks/image-segmentation/spec/input.json  |   4 +-
 .../src/tasks/image-to-image/inference.ts     |  22 ++-
 .../src/tasks/image-to-image/spec/input.json  |   4 +-
 .../src/tasks/image-to-text/inference.ts      |  42 +++--
 .../src/tasks/image-to-text/spec/input.json   |   4 +-
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../tasks/object-detection/spec/input.json    |   4 +-
 .../src/tasks/placeholder/spec/input.json     |  16 +-
 .../src/tasks/question-answering/inference.ts | 148 +++++++++--------
 .../tasks/question-answering/spec/input.json  |   5 +-
 .../tasks/sentence-similarity/inference.ts    |  38 +++--
 .../tasks/sentence-similarity/spec/input.json |   6 +-
 .../sentence-similarity/spec/output.json      |   3 +-
 .../src/tasks/summarization/inference.ts      |  58 ++++---
 .../table-question-answering/inference.ts     |  78 +++++----
 .../table-question-answering/spec/input.json  |   5 +-
 .../tasks/text-classification/inference.ts    |  58 ++++---
 .../tasks/text-classification/spec/input.json |   4 +-
 .../src/tasks/text-generation/inference.ts    | 128 ++++++++-------
 .../src/tasks/text-generation/spec/input.json |   4 +-
 .../src/tasks/text-to-audio/inference.ts      |  40 +++--
 .../src/tasks/text-to-audio/spec/input.json   |   4 +-
 .../src/tasks/text-to-speech/inference.ts     |  40 +++--
 .../tasks/text2text-generation/inference.ts   |  58 ++++---
 .../text2text-generation/spec/input.json      |   4 +-
 .../tasks/token-classification/inference.ts   |  92 ++++++-----
 .../token-classification/spec/input.json      |   4 +-
 .../tasks/src/tasks/translation/inference.ts  |  58 ++++---
 .../tasks/video-classification/inference.ts   |  74 +++++----
 .../video-classification/spec/input.json      |   6 +-
 .../visual-question-answering/inference.ts    |  76 +++++----
 .../visual-question-answering/spec/input.json |   6 +-
 .../zero-shot-classification/inference.ts     |  84 +++++-----
 .../zero-shot-classification/spec/input.json  |   6 +-
 .../inference.ts                              |  72 ++++-----
 .../spec/input.json                           |   6 +-
 .../zero-shot-object-detection/inference.ts   |  80 +++++----
 .../spec/input.json                           |   6 +-
 53 files changed, 962 insertions(+), 1026 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 1ba43fae1..bfc7af54e 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * The input audio data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: AudioClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: AudioClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 60062756c..d6cc4516c 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 7ddfe0055..bf594e048 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,33 +1,31 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-    /**
-     * The input audio data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-    /**
-     * The recognized text.
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index 147851e57..be2471966 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -20,5 +20,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 9019043eb..ca831fdb4 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DepthEstimationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,9 +27,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index 80f491974..e5553f126 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index a04b3b39b..73eb58f1c 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-    /**
-     * One (document, question) pair to answer
-     */
-    input: DocumentQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DocumentQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (document, question) pair to answer
+	 */
+	data: DocumentQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (document, question) pair to answer
  */
-export interface DocumentQuestionAnsweringInputSingle {
-    /**
-     * The image on which the question is asked
-     */
-    image: unknown;
-    /**
-     * A question to ask of the document
-     */
-    question: string;
-    [property: string]: unknown;
+export interface DocumentQuestionAnsweringInputData {
+	/**
+	 * The image on which the question is asked
+	 */
+	image: unknown;
+	/**
+	 * A question to ask of the document
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,65 +40,65 @@ export interface DocumentQuestionAnsweringInputSingle {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-    /**
-     * If the words in the document are too long to fit with the question for the model, it will
-     * be split in several chunks with some overlap. This argument controls the size of that
-     * overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * Language to use while running OCR. Defaults to english.
-     */
-    lang?: string;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using doc_stride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Can return less
-     * than top_k answers if there are not enough options available within the context.
-     */
-    topK?: number;
-    /**
-     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-     * skip the OCR step and use the provided bounding boxes instead.
-     */
-    wordBoxes?: Array<number[] | string>;
-    [property: string]: unknown;
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    end:    number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    start: number;
-    /**
-     * The index of each word/box pair that is in the answer
-     */
-    words: number[];
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 580a95c92..2161614c4 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -4,9 +4,20 @@
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (document, question) pair to answer",
-			"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
+			"type": "object",
+			"title": "DocumentQuestionAnsweringInputData",
+			"properties": {
+				"image": {
+					"description": "The image on which the question is asked"
+				},
+				"question": {
+					"type": "string",
+					"description": "A question to ask of the document"
+				}
+			},
+			"required": ["image", "question"]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -67,21 +78,7 @@
 					}
 				}
 			}
-		},
-		"DocumentQuestionAnsweringInputSingle": {
-			"type": "object",
-			"title": "DocumentQuestionAnsweringInputSingle",
-			"properties": {
-				"image": {
-					"description": "The image on which the question is asked"
-				},
-				"question": {
-					"type": "string",
-					"description": "A question to ask of the document"
-				}
-			},
-			"required": ["image", "question"]
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index d8674f516..664b8fa33 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,22 +1,20 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Embedding inference
  */
 export interface FeatureExtractionInput {
-    /**
-     * The text to get the embeddings of
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The text to get the embeddings of
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index e2eadc4e9..0170a70cd 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Embedding inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to get the embeddings of",
 			"type": "string"
 		},
@@ -21,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index e01feec2f..c51ba8ec9 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-    /**
-     * The text with masked tokens
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: FillMaskParameters;
-    [property: string]: unknown;
+	/**
+	 * The text with masked tokens
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,39 +25,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-    /**
-     * When passed, the model will limit the scores to the passed targets instead of looking up
-     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-     * tokenized and the first resulting token will be used (with a warning, and that might be
-     * slower).
-     */
-    targets?: string[];
-    /**
-     * When passed, overrides the number of predictions to return.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[];
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-    /**
-     * The corresponding probability
-     */
-    score: number;
-    /**
-     * The corresponding input with the mask token prediction.
-     */
-    sequence: string;
-    /**
-     * The predicted token id (to replace the masked one).
-     */
-    token: number;
-    /**
-     * The predicted token (to replace the masked one).
-     */
-    tokenStr: string;
-    [property: string]: unknown;
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 62f935fe2..0174dbd5b 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text with masked tokens",
 			"type": "string"
 		},
@@ -33,5 +33,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 488531255..de10f4731 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: ImageClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: ImageClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index fec2d9aa0..00c6e8b9f 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index ba31379ef..366c998f3 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageSegmentationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-    /**
-     * Threshold to use when turning the predicted masks into binary values.
-     */
-    maskThreshold?: number;
-    /**
-     * Mask overlap threshold to eliminate small, disconnected segments.
-     */
-    overlapMaskAreaThreshold?: number;
-    /**
-     * Segmentation task to be performed, depending on model capabilities.
-     */
-    subtask?: ImageSegmentationSubtask;
-    /**
-     * Probability threshold to filter out predicted masks.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: ImageSegmentationSubtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
@@ -54,17 +52,17 @@ export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-    /**
-     * The label of the predicted segment
-     */
-    label: string;
-    /**
-     * The corresponding mask as a black-and-white image
-     */
-    mask: unknown;
-    /**
-     * The score or confidence degreee the model has
-     */
-    score?: number;
-    [property: string]: unknown;
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: unknown;
+	/**
+	 * The score or confidence degreee the model has
+	 */
+	score?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 5e2a115e3..cb0c8dd18 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6ee8d47ba..f05e24b6e 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: unknown;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 61653d3c3..f95e74d3d 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -18,5 +18,5 @@
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index d4430b5c5..210b2d878 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageToTextParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,20 +25,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-    /**
-     * The amount of maximum tokens to generate.
-     */
-    maxNewTokens?: number;
-    [property: string]: unknown;
+	/**
+	 * The amount of maximum tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 0ae6331c5..a49b445fe 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 0d38adb58..f432d2cba 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ObjectDetectionParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,31 +25,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-    /**
-     * The probability necessary to make a prediction.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: BoundingBox;
-    /**
-     * The predicted label for the bounding box
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -59,9 +57,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 5055f2e17..8593df43c 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index 8dc1c3261..eb8b9b50b 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for <TASK_ID> inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "TODO: describe the input here. This must be model & framework agnostic.",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -40,5 +30,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 7c3dd476a..1895b1dd4 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-    /**
-     * One (context, question) pair to answer
-     */
-    input: Input;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: QuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (context, question) pair to answer
+	 */
+	data: QuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (context, question) pair to answer
  */
-export interface Input {
-    /**
-     * The context to be used for answering the question
-     */
-    context: string;
-    /**
-     * The question to be answered
-     */
-    question: string;
-    [property: string]: unknown;
+export interface QuestionAnsweringInputData {
+	/**
+	 * The context to be used for answering the question
+	 */
+	context: string;
+	/**
+	 * The question to be answered
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,63 +40,63 @@ export interface Input {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-    /**
-     * Attempts to align the answer to real words. Improves quality on space separated
-     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-     */
-    alignToWords?: boolean;
-    /**
-     * If the context is too long to fit with the question for the model, it will be split in
-     * several chunks with some overlap. This argument controls the size of that overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer.
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using docStride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    /**
-     * The character position in the input where the answer ends.
-     */
-    end: number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    /**
-     * The character position in the input where the answer begins.
-     */
-    start: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index da38a8f8a..92484661b 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -4,7 +4,8 @@
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
+			"title": "QuestionAnsweringInputData",
 			"description": "One (context, question) pair to answer",
 			"type": "object",
 			"properties": {
@@ -61,5 +62,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 2b0df4011..f1b72447d 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,34 +1,32 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-    input: InputObject;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	data: SentenceSimilarityInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
-export interface InputObject {
-    /**
-     * A list of strings which will be compared against the source_sentence.
-     */
-    sentences: string[];
-    /**
-     * The string that you wish to compare the other strings with. This can be a phrase,
-     * sentence, or longer passage, depending on the model being used.
-     */
-    sourceSentence: string;
-    [property: string]: unknown;
+export interface SentenceSimilarityInputData {
+	/**
+	 * A list of strings which will be compared against the source_sentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index 2e85543f8..1141781e0 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Sentence similarity inference",
 	"type": "object",
 	"properties": {
-		"input": {
-			"title": "SentenceSimilarityInput",
+		"data": {
+			"title": "SentenceSimilarityInputData",
 			"type": "object",
 			"properties": {
 				"sourceSentence": {
@@ -35,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/output.json b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
index e1fc1c9ac..ca13d98bd 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/output.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
@@ -6,6 +6,7 @@
 	"type": "array",
 	"items": {
 		"description": "The associated similarity score for each of the given sentences",
-		"type": "number"
+		"type": "number",
+		"title": "SentenceSimilarityScore"
 	}
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 9a063e93c..16d30cf7a 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 21cc519b0..836aab94d 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,61 +1,59 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-    /**
-     * One (table, question) pair to answer
-     */
-    input: Input;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One (table, question) pair to answer
+	 */
+	data: TableQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * One (table, question) pair to answer
  */
-export interface Input {
-    /**
-     * The question to be answered about the table
-     */
-    question: string;
-    /**
-     * The table to serve as context for the questions
-     */
-    table: { [key: string]: unknown };
-    [property: string]: unknown;
+export interface TableQuestionAnsweringInputData {
+	/**
+	 * The question to be answered about the table
+	 */
+	question: string;
+	/**
+	 * The table to serve as context for the questions
+	 */
+	table: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-    /**
-     * If the model has an aggregator, this returns the aggregator.
-     */
-    aggregator?: string;
-    /**
-     * The answer of the question given the table. If there is an aggregator, the answer will be
-     * preceded by `AGGREGATOR >`.
-     */
-    answer: string;
-    /**
-     * List of strings made up of the answer cell values.
-     */
-    cells: string[];
-    /**
-     * Coordinates of the cells of the answers.
-     */
-    coordinates: Array<number[]>;
-    [property: string]: unknown;
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer of the question given the table. If there is an aggregator, the answer will be
+	 * preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers.
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index 3ceb5c07a..ee6fcbce5 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -4,8 +4,9 @@
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (table, question) pair to answer",
+			"title": "TableQuestionAnsweringInputData",
 			"type": "object",
 			"properties": {
 				"table": {
@@ -32,5 +33,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 6e09d5c7c..5f4f466a0 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-    /**
-     * The text to classify
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to classify
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: TextClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: TextClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 85c8468ad..26d0bd9f1 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to classify",
 			"type": "string"
 		},
@@ -42,5 +42,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index cd83abbb2..13a09ff28 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to initialize generation with
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: unknown;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stop_sequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `top_p` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index b1cf45995..0c8bf8eaa 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to initialize generation with",
 			"type": "string"
 		},
@@ -69,5 +69,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 71e745c21..be2a70bfd 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,37 +1,35 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index a0802c102..5c69ef179 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -21,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index be5c01981..f119bc62f 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
@@ -29,13 +27,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index ce6623422..81c160e27 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -48,9 +46,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index 495b5a281..e54834e99 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text2text Generation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -50,5 +50,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 629c47c47..4584ca51d 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TokenClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-    /**
-     * The strategy used to fuse tokens based on model predictions
-     */
-    aggregationStrategy?: TokenClassificationAggregationStrategy;
-    /**
-     * A list of labels to ignore
-     */
-    ignoreLabels?: string[];
-    /**
-     * The number of overlapping tokens between chunks when splitting the input text.
-     */
-    stride?: number;
-    [property: string]: unknown;
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: TokenClassificationAggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -62,26 +60,26 @@ export type TokenClassificationAggregationStrategy = "none" | "simple" | "first"
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-    /**
-     * The character position in the input where this group ends.
-     */
-    end?: number;
-    /**
-     * The predicted label for that group of tokens
-     */
-    entityGroup?: string;
-    label:        unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    /**
-     * The character position in the input where this group begins.
-     */
-    start?: number;
-    /**
-     * The corresponding text
-     */
-    word?: string;
-    [property: string]: unknown;
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The corresponding text
+	 */
+	word?: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index f46b20cf9..9b59fcb79 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -60,5 +60,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 96090808b..c932617a4 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 2d258b33b..1914bfda6 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-    /**
-     * One or several videos to be classified
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VideoClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input video data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-    /**
-     * The sampling rate used to select frames from the video.
-     */
-    frameSamplingRate?: number;
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: TextClassificationOutputTransform;
-    /**
-     * The number of sampled frames to consider for classification.
-     */
-    numFrames?: number;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: TextClassificationOutputTransform;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -52,13 +50,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 796ce393f..c05a8b111 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
-			"description": "One or several videos to be classified"
+		"data": {
+			"description": "The input video data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 7d192a33d..0b0ee2e5a 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-    /**
-     * One (image, question) pair to answer
-     */
-    input: VisualQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VisualQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (image, question) pair to answer
+	 */
+	data: VisualQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (image, question) pair to answer
  */
-export interface VisualQuestionAnsweringInputSingle {
-    /**
-     * The image.
-     */
-    image: unknown;
-    /**
-     * The question to answer based on the image.
-     */
-    question: unknown;
-    [property: string]: unknown;
+export interface VisualQuestionAnsweringInputData {
+	/**
+	 * The image.
+	 */
+	image: unknown;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,27 +40,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-    /**
-     * The answer to the question
-     */
-    answer?: string;
-    label:   unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 2e77422d9..3a54c69fa 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (image, question) pair to answer",
 			"type": "object",
-			"title": "VisualQuestionAnsweringInputSingle",
+			"title": "VisualQuestionAnsweringInputData",
 			"properties": {
 				"image": {
 					"description": "The image."
@@ -36,5 +36,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 578f24946..369474a6d 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-    /**
-     * The input text data, with candidate labels
-     */
-    input: InputObject;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data, with candidate labels
+	 */
+	data: ZeroShotClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
  * The input text data, with candidate labels
  */
-export interface InputObject {
-    /**
-     * The set of possible class labels to classify the text into.
-     */
-    candidateLabels: string[];
-    /**
-     * The text to classify
-     */
-    text: string;
-    [property: string]: unknown;
+export interface ZeroShotClassificationInputData {
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,31 +40,31 @@ export interface InputObject {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    /**
-     * Whether multiple candidate labels can be true. If false, the scores are normalized such
-     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-     * considered independent and probabilities are normalized for each candidate.
-     */
-    multiLabel?: boolean;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index ce10d0b61..d4d0ba00b 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotClassificationInput",
+			"title": "ZeroShotClassificationInputData",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -45,5 +45,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 970d7708b..65649ff5a 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-    /**
-     * The input image data, with candidate labels
-     */
-    input: ZeroShotImageClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotImageClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
  * The input image data, with candidate labels
  */
-export interface ZeroShotImageClassificationInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to classify
-     */
-    image: unknown;
-    [property: string]: unknown;
+export interface ZeroShotImageClassificationInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,25 +40,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 07fffa5ef..44102978e 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotImageClassificationInputSingle",
+			"title": "ZeroShotImageClassificationInputData",
 			"properties": {
 				"image": {
 					"description": "The image data to classify"
@@ -40,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 2e3e12f74..987662e24 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,59 +1,57 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-    /**
-     * The input image data, with candidate labels
-     */
-    input: ZeroShotObjectDetectionInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotObjectDetectionInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * The input image data, with candidate labels
  */
-export interface ZeroShotObjectDetectionInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to generate bounding boxes from
-     */
-    image: unknown;
-    [property: string]: unknown;
+export interface ZeroShotObjectDetectionInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: BoundingBox;
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -61,9 +59,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 7f72f3f9b..417dc0a78 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotObjectDetectionInputSingle",
+			"title": "ZeroShotObjectDetectionInputData",
 			"properties": {
 				"image": {
 					"description": "The image data to generate bounding boxes from"
@@ -35,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }

From 3a98f588031af625b798bd5a817052014d20b3e9 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:44:19 +0100
Subject: [PATCH 32/51] enable explicit-unions when generating

---
 packages/tasks/src/scripts/inference-codegen.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index aa92ba5a4..fedb95a64 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -62,7 +62,7 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 			"prefer-unions": true,
 			"prefer-const-values": true,
 			"prefer-unknown": true,
-			// "explicit-unions": true,
+			"explicit-unions": true,
 		},
 	});
 }

From e0a493957ddc587b814144694cdc40e147186a94 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:46:59 +0100
Subject: [PATCH 33/51] tweaks

---
 .../document-question-answering/inference.ts  |  4 +-
 .../src/tasks/feature-extraction/inference.ts |  2 +
 .../tasks/feature-extraction/spec/input.json  |  2 +-
 .../tasks/feature-extraction/spec/output.json | 48 +------------------
 4 files changed, 8 insertions(+), 48 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 73eb58f1c..4502a8ffb 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -78,10 +78,12 @@ export interface DocumentQuestionAnsweringParameters {
 	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
 	 * skip the OCR step and use the provided bounding boxes instead.
 	 */
-	wordBoxes?: Array<number[] | string>;
+	wordBoxes?: WordBox[];
 	[property: string]: unknown;
 }
 
+export type WordBox = number[] | string;
+
 /**
  * Outputs of inference for the Document Question Answering task
  */
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 664b8fa33..22dc8dd1d 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -4,6 +4,8 @@
  * Using src/scripts/inference-codegen
  */
 
+export type FeatureExtractionOutput = unknown[];
+
 /**
  * Inputs for Text Embedding inference
  */
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 0170a70cd..8bea845e6 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"$id": "/inference/schemas/text-embedding/input.json",
+	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Embedding inference",
 	"type": "object",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 54a29d10e..b51788daa 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -1,51 +1,7 @@
 {
 	"$id": "/inference/schemas/feature-extraction/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Outputs of inference for the Feature Extraction task",
+	"description": "The embedding for the input text, as a nested list (tensor) of floats",
 	"type": "array",
-	"title": "FeatureExtractionOutput",
-	"items": {
-		"description": "The features computed by the mode, as a nested list of floats",
-		"anyOf": [
-			{
-				"type": "number"
-			},
-			{
-				"type": "array",
-				"items": {
-					"anyOf": [
-						{
-							"type": "number"
-						},
-						{
-							"type": "array",
-							"items": {
-								"anyOf": [
-									{
-										"type": "number"
-									},
-									{
-										"type": "array",
-										"items": {
-											"anyOf": [
-												{
-													"type": "number"
-												},
-												{
-													"type": "array",
-													"items": {
-														"type": "number"
-													}
-												}
-											]
-										}
-									}
-								]
-							}
-						}
-					]
-				}
-			}
-		]
-	}
+	"title": "FeatureExtractionOutput"
 }

From 2d463999beb8e4b8054ebf7a87e8bf1459cbd37e Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 10:40:01 +0100
Subject: [PATCH 34/51] =?UTF-8?q?=F0=9F=A9=B9=20Don't=20use=20require=20in?=
 =?UTF-8?q?=20rootDirFinder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index fedb95a64..1988c4c0d 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -19,11 +19,8 @@ const rootDirFinder = function (): string {
 	while (level > 0) {
 		const currentPath = parts.slice(0, level).join("/");
 		console.debug(currentPath);
-		try {
-			require(`${currentPath}/package.json`);
+		if (pathExists(`${currentPath}/package.json`)) {
 			return path.normalize(currentPath);
-		} catch (err) {
-			/// noop
 		}
 		level--;
 	}

From c1151c0caf608edb01c90ff9726b728909cbb2f8 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 11:04:46 +0100
Subject: [PATCH 35/51] Explicit titles

---
 packages/tasks/src/tasks/audio-classification/spec/input.json    | 1 +
 .../tasks/src/tasks/automatic-speech-recognition/spec/input.json | 1 +
 .../src/tasks/automatic-speech-recognition/spec/output.json      | 1 +
 packages/tasks/src/tasks/depth-estimation/spec/input.json        | 1 +
 packages/tasks/src/tasks/depth-estimation/spec/output.json       | 1 +
 .../tasks/src/tasks/document-question-answering/spec/input.json  | 1 +
 .../tasks/src/tasks/document-question-answering/spec/output.json | 1 +
 packages/tasks/src/tasks/feature-extraction/spec/input.json      | 1 +
 packages/tasks/src/tasks/fill-mask/spec/input.json               | 1 +
 packages/tasks/src/tasks/fill-mask/spec/output.json              | 1 +
 packages/tasks/src/tasks/image-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/image-classification/spec/output.json   | 1 +
 packages/tasks/src/tasks/image-segmentation/spec/input.json      | 1 +
 packages/tasks/src/tasks/image-segmentation/spec/output.json     | 1 +
 packages/tasks/src/tasks/image-to-image/spec/input.json          | 1 +
 packages/tasks/src/tasks/image-to-image/spec/output.json         | 1 +
 packages/tasks/src/tasks/image-to-text/spec/input.json           | 1 +
 packages/tasks/src/tasks/image-to-text/spec/output.json          | 1 +
 packages/tasks/src/tasks/object-detection/spec/input.json        | 1 +
 packages/tasks/src/tasks/object-detection/spec/output.json       | 1 +
 packages/tasks/src/tasks/placeholder/spec/input.json             | 1 +
 packages/tasks/src/tasks/placeholder/spec/output.json            | 1 +
 packages/tasks/src/tasks/question-answering/spec/input.json      | 1 +
 packages/tasks/src/tasks/sentence-similarity/spec/input.json     | 1 +
 packages/tasks/src/tasks/summarization/spec/input.json           | 1 +
 packages/tasks/src/tasks/summarization/spec/output.json          | 1 +
 .../tasks/src/tasks/table-question-answering/spec/input.json     | 1 +
 .../tasks/src/tasks/table-question-answering/spec/output.json    | 1 +
 packages/tasks/src/tasks/text-classification/spec/input.json     | 1 +
 packages/tasks/src/tasks/text-classification/spec/output.json    | 1 +
 packages/tasks/src/tasks/text-generation/spec/input.json         | 1 +
 packages/tasks/src/tasks/text-generation/spec/output.json        | 1 +
 packages/tasks/src/tasks/text-to-audio/spec/input.json           | 1 +
 packages/tasks/src/tasks/text-to-audio/spec/output.json          | 1 +
 packages/tasks/src/tasks/text-to-speech/spec/input.json          | 1 +
 packages/tasks/src/tasks/text-to-speech/spec/output.json         | 1 +
 packages/tasks/src/tasks/text2text-generation/spec/input.json    | 1 +
 packages/tasks/src/tasks/text2text-generation/spec/output.json   | 1 +
 packages/tasks/src/tasks/token-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/token-classification/spec/output.json   | 1 +
 packages/tasks/src/tasks/translation/spec/input.json             | 1 +
 packages/tasks/src/tasks/translation/spec/output.json            | 1 +
 packages/tasks/src/tasks/video-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/video-classification/spec/output.json   | 1 +
 .../tasks/src/tasks/visual-question-answering/spec/input.json    | 1 +
 .../tasks/src/tasks/visual-question-answering/spec/output.json   | 1 +
 .../tasks/src/tasks/zero-shot-classification/spec/input.json     | 1 +
 .../tasks/src/tasks/zero-shot-classification/spec/output.json    | 1 +
 .../src/tasks/zero-shot-image-classification/spec/input.json     | 1 +
 .../src/tasks/zero-shot-image-classification/spec/output.json    | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/spec/input.json   | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/spec/output.json  | 1 +
 52 files changed, 52 insertions(+)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index d6cc4516c..80e8651fe 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/audio-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Audio Classification inference",
+	"title": "AudioClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index be2471966..f44075d56 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/automatic-speech-recognition/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Automatic Speech Recognition inference",
+	"title": "AutomaticSpeechRecognitionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index a8b8af782..72573986d 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/automatic-speech-recognition/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
+	"title": "AutomaticSpeechRecognitionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index e5553f126..3d58c82ff 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/depth-estimation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Depth Estimation inference",
+	"title": "DepthEstimationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
index c3ebebcc5..72d6a714d 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/output.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/depth-estimation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Depth Estimation task",
+	"title": "DepthEstimationOutput",
 	"type": "array",
 	"items": {
 		"description": "The output depth labels"
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 2161614c4..a607735e7 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/document-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Document Question Answering inference",
+	"title": "DocumentQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
index 4c7752775..9f69584ae 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/document-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Document Question Answering task",
+	"title": "DocumentQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 8bea845e6..a61455f6c 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Embedding inference",
+	"title": "FeatureExtractionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 0174dbd5b..00def602e 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/fill-mask/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Fill Mask inference",
+	"title": "FillMaskInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
index 3453d65d4..f8e91aeea 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/output.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/fill-mask/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Fill Mask task",
+	"title": "FillMaskOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 00c6e8b9f..1dee66b97 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Classification inference",
+	"title": "ImageClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index da8a2a5c7..a875898b6 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Classification task",
+	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index cb0c8dd18..ae4adc70e 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-segmentation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Segmentation inference",
+	"title": "ImageSegmentationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 4b7cb643c..b20aa415e 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-segmentation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Segmentation task",
+	"title": "ImageSegmentationOutput",
 	"type": "array",
 	"items": {
 		"description": "A predicted mask / segment",
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index f95e74d3d..d91d6e6d4 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-image/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Image inference",
+	"title": "ImageToImageInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index d9c4f9bf2..5e55f5677 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-image/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
+	"title": "ImageToImageOutput",
 	"type": "array",
 	"items": {
 		"description": "The output image"
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index a49b445fe..f06eb59f0 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-text/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Text inference",
+	"title": "ImageToTextInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
index 81960cd22..e3283e34f 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-text/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Text task",
+	"title": "ImageToTextOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 8593df43c..e01ebf496 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Object Detection inference",
+	"title": "ObjectDetectionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index 450d96ed2..20c92d5d3 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Object Detection task",
+	"title": "ObjectDetectionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index eb8b9b50b..5c206baef 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/<TASK_ID>/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for <TASK_ID> inference",
+	"title": "PlaceholderInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/placeholder/spec/output.json b/packages/tasks/src/tasks/placeholder/spec/output.json
index b4b4225f6..8e3e13294 100644
--- a/packages/tasks/src/tasks/placeholder/spec/output.json
+++ b/packages/tasks/src/tasks/placeholder/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/<TASK_ID>/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for <TASK_ID> inference",
+	"title": "PlaceholderOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 92484661b..088e77200 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Question Answering inference",
+	"title": "QuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index 1141781e0..8bb9e2e5a 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/sentence-similarity/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Sentence similarity inference",
+	"title": "SentenceSimilarityInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/summarization/spec/input.json b/packages/tasks/src/tasks/summarization/spec/input.json
index b7c09d1db..629da31ea 100644
--- a/packages/tasks/src/tasks/summarization/spec/input.json
+++ b/packages/tasks/src/tasks/summarization/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/input.json",
 	"$id": "/inference/schemas/summarization/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SummarizationInput",
 	"description": "Inputs for Summarization inference"
 }
diff --git a/packages/tasks/src/tasks/summarization/spec/output.json b/packages/tasks/src/tasks/summarization/spec/output.json
index df7331ee6..9b1f8bf30 100644
--- a/packages/tasks/src/tasks/summarization/spec/output.json
+++ b/packages/tasks/src/tasks/summarization/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/output.json",
 	"$id": "/inference/schemas/summarization/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SummarizationOutput",
 	"description": "Outputs for Summarization inference"
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index ee6fcbce5..e3fc6db9f 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/table-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Table Question Answering inference",
+	"title": "TableQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
index 864900647..9b43026ea 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/table-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Table Question Answering task",
+	"title": "TableQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 26d0bd9f1..08bac5953 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Classification inference",
+	"title": "TextClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 4e6d69ed9..b2b81acde 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text Classification task",
+	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 0c8bf8eaa..223561691 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Generation inference",
+	"title": "TextGenerationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index 4f1eb95e5..eacb907e2 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text Generation inference",
+	"title": "TextGenerationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 5c69ef179..176060962 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-to-audio/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text To Audio inference",
+	"title": "TextToAudioInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
index f91a9563e..b0a25bd9a 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-to-audio/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text To Audio task",
+	"title": "TextToAudioOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index dffbf7910..7d2bac092 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text-to-audio/input.json",
 	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TextToSpeechInput",
 	"description": "Inputs for Text to Speech inference"
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index 4678592e8..91654e2b5 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text-to-audio/output.json",
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TextToSpeechOutput",
 	"description": "Outputs for Text to Speech inference"
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index e54834e99..a00ae575f 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text2text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text2text Generation inference",
+	"title": "Text2TextGenerationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 190aa6014..f60ba8933 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text2text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text2text Generation task",
+	"title": "Text2TextGenerationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index 9b59fcb79..2fd89ce34 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/token-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Token Classification inference",
+	"title": "TokenClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
index 7685b740b..8522d972a 100644
--- a/packages/tasks/src/tasks/token-classification/spec/output.json
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/token-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Token Classification task",
+	"title": "TokenClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/translation/spec/input.json b/packages/tasks/src/tasks/translation/spec/input.json
index e3aac752c..0695bc672 100644
--- a/packages/tasks/src/tasks/translation/spec/input.json
+++ b/packages/tasks/src/tasks/translation/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/input.json",
 	"$id": "/inference/schemas/translation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TranslationInput",
 	"description": "Inputs for Translation inference"
 }
diff --git a/packages/tasks/src/tasks/translation/spec/output.json b/packages/tasks/src/tasks/translation/spec/output.json
index 6dcb98077..61b701db2 100644
--- a/packages/tasks/src/tasks/translation/spec/output.json
+++ b/packages/tasks/src/tasks/translation/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/output.json",
 	"$id": "/inference/schemas/translation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TranslationOutput",
 	"description": "Outputs for Translation inference"
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index c05a8b111..386992c9a 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/video-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Video Classification inference",
+	"title": "VideoClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 7121e472f..9220cdbae 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/video-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Video Classification task",
+	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 3a54c69fa..b6cb0e123 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/visual-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Visual Question Answering inference",
+	"title": "VisualQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
index 2005d9f2f..32c9c6c26 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/visual-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Visual Question Answering task",
+	"title": "VisualQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index d4d0ba00b..689c22769 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Classification inference",
+	"title": "ZeroShotClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 54f226d9d..27ad4b00e 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Classification task",
+	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 44102978e..d5b212918 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Image Classification inference",
+	"title": "ZeroShotImageClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 102944ebc..2b0e78b84 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Image Classification task",
+	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 417dc0a78..63dce00ed 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Object Detection inference",
+	"title": "ZeroShotObjectDetectionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 171e81120..6293efc97 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Object Detection task",
+	"title": "ZeroShotObjectDetectionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",

From 077a88f65be4136c101e3fb42ff380cb9feb4d20 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:37:34 +0100
Subject: [PATCH 36/51] Post-process hack to generate array types

---
 .../tasks/src/scripts/inference-codegen.ts    | 72 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 1988c4c0d..bf4d959ae 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -3,6 +3,7 @@ import { quicktype, InputData, JSONSchemaInput, FetchingJSONSchemaStore } from "
 import * as fs from "fs/promises";
 import { existsSync as pathExists } from "fs";
 import * as path from "path";
+import * as ts from "typescript";
 
 const TYPESCRIPT_HEADER_FILE = `
 /**
@@ -18,7 +19,6 @@ const rootDirFinder = function (): string {
 	let level = parts.length - 1;
 	while (level > 0) {
 		const currentPath = parts.slice(0, level).join("/");
-		console.debug(currentPath);
 		if (pathExists(`${currentPath}/package.json`)) {
 			return path.normalize(currentPath);
 		}
@@ -64,6 +64,71 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 	});
 }
 
+/**
+ * Post-process a generated file for an array-typed output spec: rename the top-level interface `<title>` to
+ * `<title>Element` and append `export type <title> = <title>Element[]` plus the renamed interface.
+ * Does nothing if the spec is not an array with a string title, the alias exists, or no such interface is found.
+ * @param path2generated Path of the generated TypeScript file; read, then overwritten in place.
+ * @param outputSpec Parsed output schema of the task; only its `type` and `title` are read.
+ */
+async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
+	const exportedName = outputSpec.title;
+	if (outputSpec.type !== "array" || typeof exportedName !== "string") {
+		console.log("      Nothing to do");
+		return;
+	}
+	const source = ts.createSourceFile(
+		path.basename(path2generated),
+		await fs.readFile(path2generated, { encoding: "utf-8" }),
+		ts.ScriptTarget.ES2022
+	);
+	const topLevelNodes = source.getChildAt(0).getChildren();
+	const hasTypeAlias = topLevelNodes.some(
+		(node) => ts.isTypeAliasDeclaration(node) && node.name.escapedText === exportedName
+	);
+	if (hasTypeAlias) {
+		return;
+	}
+
+	const interfaceDeclaration = topLevelNodes.find(
+		(node): node is ts.InterfaceDeclaration =>
+			ts.isInterfaceDeclaration(node) && node.name.getText(source) === exportedName
+	);
+	if (!interfaceDeclaration) {
+		console.log("      Nothing to do");
+		return;
+	}
+
+	console.log("      Inserting top-level array type alias...");
+	const updatedInterface = ts.factory.updateInterfaceDeclaration(
+		interfaceDeclaration,
+		interfaceDeclaration.modifiers,
+		ts.factory.createIdentifier(interfaceDeclaration.name.getText(source) + "Element"),
+		interfaceDeclaration.typeParameters,
+		interfaceDeclaration.heritageClauses,
+		interfaceDeclaration.members
+	);
+	const arrayDeclaration = ts.factory.createTypeAliasDeclaration(
+		[ts.factory.createModifier(ts.SyntaxKind.ExportKeyword)],
+		exportedName,
+		undefined,
+		ts.factory.createArrayTypeNode(ts.factory.createTypeReferenceNode(updatedInterface.name))
+	);
+
+	const printer = ts.createPrinter();
+	const newNodes = ts.factory.createNodeArray([
+		...topLevelNodes.filter((node) => node !== interfaceDeclaration),
+		arrayDeclaration,
+		updatedInterface,
+	]);
+
+	// Await the write so failures reject this function instead of becoming an unhandled rejection.
+	await fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), {
+		flag: "w+",
+		encoding: "utf-8",
+	});
+}
+
 async function main() {
 	const rootDir = rootDirFinder();
 	const tasksDir = path.join(rootDir, "src", "tasks");
@@ -96,6 +161,11 @@ async function main() {
 				encoding: "utf-8",
 			});
 		}
+
+		const outputSpec = JSON.parse(await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }));
+
+		console.log("   🩹 Post-processing the generated code");
+		await postProcessOutput(`${dirPath}/inference.ts`, outputSpec);
 	}
 	console.debug("✅ All done!");
 }

From 6b10c4d949112528a23bda474d11523d1095ed16 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:38:12 +0100
Subject: [PATCH 37/51] regenerate code

---
 .../tasks/audio-classification/inference.ts   |  7 +-
 .../automatic-speech-recognition/inference.ts |  9 +--
 .../document-question-answering/inference.ts  |  8 +--
 .../tasks/src/tasks/fill-mask/inference.ts    |  6 +-
 .../tasks/image-classification/inference.ts   |  7 +-
 .../src/tasks/image-segmentation/inference.ts |  7 +-
 .../src/tasks/image-to-text/inference.ts      |  6 +-
 .../src/tasks/object-detection/inference.ts   | 29 ++++----
 .../src/tasks/question-answering/inference.ts |  7 +-
 .../table-question-answering/inference.ts     | 14 ++--
 .../tasks/text-classification/inference.ts    |  7 +-
 .../src/tasks/text-generation/inference.ts    |  6 +-
 .../src/tasks/text-to-audio/inference.ts      |  9 +--
 .../tasks/text2text-generation/inference.ts   | 11 ++-
 .../tasks/token-classification/inference.ts   |  7 +-
 .../tasks/video-classification/inference.ts   |  7 +-
 .../visual-question-answering/inference.ts    |  7 +-
 .../zero-shot-classification/inference.ts     |  7 +-
 .../inference.ts                              |  7 +-
 .../zero-shot-object-detection/inference.ts   | 67 -------------------
 20 files changed, 64 insertions(+), 171 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index bfc7af54e..6671cdf14 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Audio Classification inference
  */
@@ -18,7 +17,6 @@ export interface AudioClassificationInput {
 	parameters?: AudioClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface AudioClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type AudioClassificationOutput = AudioClassificationOutputElement[];
 /**
  * Outputs for Audio Classification inference
  */
-export interface AudioClassificationOutput {
+export interface AudioClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index bf594e048..d83c45af5 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
@@ -15,14 +14,16 @@ export interface AutomaticSpeechRecognitionInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
-export interface AutomaticSpeechRecognitionOutput {
+export interface AutomaticSpeechRecognitionOutputElement {
 	/**
 	 * The recognized text.
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 4502a8ffb..cd2ab5405 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Document Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface DocumentQuestionAnsweringInput {
 	parameters?: DocumentQuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (document, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface DocumentQuestionAnsweringInputData {
 	question: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -81,13 +78,12 @@ export interface DocumentQuestionAnsweringParameters {
 	wordBoxes?: WordBox[];
 	[property: string]: unknown;
 }
-
 export type WordBox = number[] | string;
-
+export type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Document Question Answering task
  */
-export interface DocumentQuestionAnsweringOutput {
+export interface DocumentQuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question.
 	 */
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index c51ba8ec9..b80383da6 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Fill Mask inference
  */
@@ -18,7 +17,6 @@ export interface FillMaskInput {
 	parameters?: FillMaskParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -38,11 +36,11 @@ export interface FillMaskParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type FillMaskOutput = FillMaskOutputElement[];
 /**
  * Outputs of inference for the Fill Mask task
  */
-export interface FillMaskOutput {
+export interface FillMaskOutputElement {
 	/**
 	 * The corresponding probability
 	 */
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index de10f4731..5a43acdf5 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image Classification inference
  */
@@ -18,7 +17,6 @@ export interface ImageClassificationInput {
 	parameters?: ImageClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface ImageClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type ImageClassificationOutput = ImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Image Classification task
  */
-export interface ImageClassificationOutput {
+export interface ImageClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 366c998f3..b316715f5 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image Segmentation inference
  */
@@ -18,7 +17,6 @@ export interface ImageSegmentationInput {
 	parameters?: ImageSegmentationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -43,15 +41,14 @@ export interface ImageSegmentationParameters {
 	threshold?: number;
 	[property: string]: unknown;
 }
-
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
-
+export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
 /**
  * Outputs of inference for the Image Segmentation task
  *
  * A predicted mask / segment
  */
-export interface ImageSegmentationOutput {
+export interface ImageSegmentationOutputElement {
 	/**
 	 * The label of the predicted segment
 	 */
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 210b2d878..cba745139 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image To Text inference
  */
@@ -18,7 +17,6 @@ export interface ImageToTextInput {
 	parameters?: ImageToTextParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -31,11 +29,11 @@ export interface ImageToTextParameters {
 	maxNewTokens?: number;
 	[property: string]: unknown;
 }
-
+export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
  */
-export interface ImageToTextOutput {
+export interface ImageToTextOutputElement {
 	/**
 	 * The generated text.
 	 */
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index f432d2cba..9650c781e 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Object Detection inference
  */
@@ -18,7 +17,6 @@ export interface ObjectDetectionInput {
 	parameters?: ObjectDetectionParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -31,11 +29,22 @@ export interface ObjectDetectionParameters {
 	threshold?: number;
 	[property: string]: unknown;
 }
-
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
+}
+export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
 /**
  * Outputs of inference for the Object Detection task
  */
-export interface ObjectDetectionOutput {
+export interface ObjectDetectionOutputElement {
 	/**
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
@@ -51,15 +60,3 @@ export interface ObjectDetectionOutput {
 	score: number;
 	[property: string]: unknown;
 }
-
-/**
- * The predicted bounding box. Coordinates are relative to the top left corner of the input
- * image.
- */
-export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
-}
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 1895b1dd4..bffc71cc6 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface QuestionAnsweringInput {
 	parameters?: QuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (context, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface QuestionAnsweringInputData {
 	question: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -77,11 +74,11 @@ export interface QuestionAnsweringParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Question Answering task
  */
-export interface QuestionAnsweringOutput {
+export interface QuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question.
 	 */
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 836aab94d..ac04c6a32 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Table Question Answering inference
  */
@@ -15,10 +14,11 @@ export interface TableQuestionAnsweringInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
 /**
  * One (table, question) pair to answer
  */
@@ -30,14 +30,16 @@ export interface TableQuestionAnsweringInputData {
 	/**
 	 * The table to serve as context for the questions
 	 */
-	table: { [key: string]: unknown };
+	table: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type TableQuestionAnsweringOutput = TableQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Table Question Answering task
  */
-export interface TableQuestionAnsweringOutput {
+export interface TableQuestionAnsweringOutputElement {
 	/**
 	 * If the model has an aggregator, this returns the aggregator.
 	 */
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 5f4f466a0..19298ccd0 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text Classification inference
  */
@@ -18,7 +17,6 @@ export interface TextClassificationInput {
 	parameters?: TextClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface TextClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type TextClassificationOutput = TextClassificationOutputElement[];
 /**
  * Outputs of inference for the Text Classification task
  */
-export interface TextClassificationOutput {
+export interface TextClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 13a09ff28..94279336c 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text Generation inference
  */
@@ -18,7 +17,6 @@ export interface TextGenerationInput {
 	parameters?: TextGenerationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -74,11 +72,11 @@ export interface TextGenerationParameters {
 	watermark?: boolean;
 	[property: string]: unknown;
 }
-
+export type TextGenerationOutput = TextGenerationOutputElement[];
 /**
  * Outputs for Text Generation inference
  */
-export interface TextGenerationOutput {
+export interface TextGenerationOutputElement {
 	/**
 	 * The generated text
 	 */
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index be2a70bfd..d6a05e017 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text To Audio inference
  */
@@ -15,14 +14,16 @@ export interface TextToAudioInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type TextToAudioOutput = TextToAudioOutputElement[];
 /**
  * Outputs of inference for the Text To Audio task
  */
-export interface TextToAudioOutput {
+export interface TextToAudioOutputElement {
 	/**
 	 * The generated audio waveform.
 	 */
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 81c160e27..788845dd2 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text2text Generation inference
  */
@@ -18,7 +17,6 @@ export interface Text2TextGenerationInput {
 	parameters?: Text2TextGenerationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -32,20 +30,21 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: unknown };
+	generateParameters?: {
+		[key: string]: unknown;
+	};
 	/**
 	 * The truncation strategy to use
 	 */
 	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
-
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
-
+export type Text2TextGenerationOutput = Text2TextGenerationOutputElement[];
 /**
  * Outputs of inference for the Text2text Generation task
  */
-export interface Text2TextGenerationOutput {
+export interface Text2TextGenerationOutputElement {
 	/**
 	 * The generated text.
 	 */
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 4584ca51d..7a8da8dcf 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Token Classification inference
  */
@@ -18,7 +17,6 @@ export interface TokenClassificationInput {
 	parameters?: TokenClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -39,7 +37,6 @@ export interface TokenClassificationParameters {
 	stride?: number;
 	[property: string]: unknown;
 }
-
 /**
  * Do not aggregate tokens
  *
@@ -55,11 +52,11 @@ export interface TokenClassificationParameters {
  * across the word's tokens).
  */
 export type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
-
+export type TokenClassificationOutput = TokenClassificationOutputElement[];
 /**
  * Outputs of inference for the Token Classification task
  */
-export interface TokenClassificationOutput {
+export interface TokenClassificationOutputElement {
 	/**
 	 * The character position in the input where this group ends.
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 1914bfda6..ede6a25e4 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Video Classification inference
  */
@@ -18,7 +17,6 @@ export interface VideoClassificationInput {
 	parameters?: VideoClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -43,13 +41,12 @@ export interface VideoClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type VideoClassificationOutput = VideoClassificationOutputElement[];
 /**
  * Outputs of inference for the Video Classification task
  */
-export interface VideoClassificationOutput {
+export interface VideoClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 0b0ee2e5a..0eb513ebf 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Visual Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface VisualQuestionAnsweringInput {
 	parameters?: VisualQuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (image, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface VisualQuestionAnsweringInputData {
 	question: unknown;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -48,11 +45,11 @@ export interface VisualQuestionAnsweringParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Visual Question Answering task
  */
-export interface VisualQuestionAnsweringOutput {
+export interface VisualQuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 369474a6d..db7f0c8bb 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Zero Shot Classification inference
  */
@@ -18,7 +17,6 @@ export interface ZeroShotClassificationInput {
 	parameters?: ZeroShotClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * The input text data, with candidate labels
  */
@@ -33,7 +31,6 @@ export interface ZeroShotClassificationInputData {
 	text: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -53,11 +50,11 @@ export interface ZeroShotClassificationParameters {
 	multiLabel?: boolean;
 	[property: string]: unknown;
 }
-
+export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
-export interface ZeroShotClassificationOutput {
+export interface ZeroShotClassificationOutputElement {
 	/**
 	 * A candidate label
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 65649ff5a..22308aabb 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
@@ -18,7 +17,6 @@ export interface ZeroShotImageClassificationInput {
 	parameters?: ZeroShotImageClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * The input image data, with candidate labels
  */
@@ -33,7 +31,6 @@ export interface ZeroShotImageClassificationInputData {
 	image: unknown;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -47,11 +44,11 @@ export interface ZeroShotImageClassificationParameters {
 	hypothesisTemplate?: string;
 	[property: string]: unknown;
 }
-
+export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
-export interface ZeroShotImageClassificationOutput {
+export interface ZeroShotImageClassificationOutputElement {
 	/**
 	 * A candidate label
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 987662e24..e69de29bb 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,67 +0,0 @@
-/**
- * Inference code generated from the JSON schema spec in ./spec
- *
- * Using src/scripts/inference-codegen
- */
-
-/**
- * Inputs for Zero Shot Object Detection inference
- */
-export interface ZeroShotObjectDetectionInput {
-	/**
-	 * The input image data, with candidate labels
-	 */
-	data: ZeroShotObjectDetectionInputData;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
-}
-
-/**
- * The input image data, with candidate labels
- */
-export interface ZeroShotObjectDetectionInputData {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
-	[property: string]: unknown;
-}
-
-/**
- * Outputs of inference for the Zero Shot Object Detection task
- */
-export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
-}
-
-/**
- * The predicted bounding box. Coordinates are relative to the top left corner of the input
- * image.
- */
-export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
-}

From c35fe85d22159a803525c7740144d2418c3f0b32 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:55:09 +0100
Subject: [PATCH 38/51] =?UTF-8?q?=F0=9F=93=9D=20Some=20comments?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index bf4d959ae..ff26c128c 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -63,6 +63,17 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 		},
 	});
 }
+/**
+ * quicktype is unable to generate "top-level array types" that are defined in the output spec: https://github.com/glideapps/quicktype/issues/2481
+ * We have to use the TypeScript API to generate those types when required.
+ * This hacky function:
+ *   - looks for the generated interface for output types
+ *   - renames it with an `Element` suffix
+ *   - generates a type alias in the form `export type <OutputType> = <OutputType>Element[];`
+ * 
+ * And writes that to the `inference.ts` file
+ *   
+ */
 
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
 	const source = ts.createSourceFile(

From 6f1a8b36af5ca1c626c678b9465149390958532a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:58:21 +0100
Subject: [PATCH 39/51] =?UTF-8?q?=F0=9F=92=84=20Lint?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index ff26c128c..c66d87b25 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -70,9 +70,9 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
  *   - looks for the generated interface for output types
  *   - renames it with an `Element` suffix
  *   - generates a type alias in the form `export type <OutputType> = <OutputType>Element[];`
- * 
+ *
  * And writes that to the `inference.ts` file
- *   
+ *
  */
 
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {

From 9d25d281e911e1a20b830b3757ea5cd46ce29661 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:42:47 +0100
Subject: [PATCH 40/51] Add text-to-image pipeline

---
 .../src/tasks/text-to-image/inference.ts      | 69 +++++++++++++++++++
 .../src/tasks/text-to-image/spec/input.json   | 53 ++++++++++++++
 .../src/tasks/text-to-image/spec/output.json  | 15 ++++
 3 files changed, 137 insertions(+)
 create mode 100644 packages/tasks/src/tasks/text-to-image/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-to-image/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-image/spec/output.json

diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts
new file mode 100644
index 000000000..bfe0433f3
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/inference.ts
@@ -0,0 +1,69 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text To Image inference
+ */
+export interface TextToImageInput {
+	/**
+	 * The input text data (sometimes called "prompt")
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextToImageParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Image
+ */
+export interface TextToImageParameters {
+	/**
+	 * For diffusion models. A higher guidance scale value encourages the model to generate
+	 * images closely linked to the text prompt at the expense of lower image quality.
+	 */
+	guidanceScale?: number;
+	/**
+	 * One or several prompts to guide what NOT to include in image generation.
+	 */
+	negativePrompt?: string[];
+	/**
+	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+	 * a higher quality image at the expense of slower inference.
+	 */
+	numInferenceSteps?: number;
+	/**
+	 * The size in pixels of the output image
+	 */
+	targetSize?: TargetSize;
+	[property: string]: unknown;
+}
+
+/**
+ * The size in pixels of the output image
+ */
+export interface TargetSize {
+	height: number;
+	width: number;
+	[property: string]: unknown;
+}
+
+/**
+ * Outputs of inference for the Text To Image task
+ */
+export type TextToImageOutput = unknown[] | boolean | number | number | null | TextToImageOutputObject | string;
+
+export interface TextToImageOutputObject {
+	/**
+	 * The generated image
+	 */
+	image: unknown;
+	[property: string]: unknown;
+}
diff --git a/packages/tasks/src/tasks/text-to-image/spec/input.json b/packages/tasks/src/tasks/text-to-image/spec/input.json
new file mode 100644
index 000000000..32a076dd0
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/spec/input.json
@@ -0,0 +1,53 @@
+{
+	"$id": "/inference/schemas/text-to-image/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Image inference",
+	"title": "TextToImageInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input text data (sometimes called \"prompt\")",
+			"type": "string"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/TextToImageParameters"
+		}
+	},
+	"$defs": {
+		"TextToImageParameters": {
+			"title": "TextToImageParameters",
+			"description": "Additional inference parameters for Text To Image",
+			"type": "object",
+			"properties": {
+				"guidanceScale": {
+					"type": "number",
+					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+				},
+				"negativePrompt": {
+					"type": "array",
+					"items": { "type": "string" },
+					"description": "One or several prompts to guide what NOT to include in image generation."
+				},
+				"numInferenceSteps": {
+					"type": "integer",
+					"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"targetSize": {
+					"type": "object",
+					"description": "The size in pixels of the output image",
+					"properties": {
+						"width": {
+							"type": "integer"
+						},
+						"height": {
+							"type": "integer"
+						}
+					},
+					"required": ["width", "height"]
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}
diff --git a/packages/tasks/src/tasks/text-to-image/spec/output.json b/packages/tasks/src/tasks/text-to-image/spec/output.json
new file mode 100644
index 000000000..5ab3ee787
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/spec/output.json
@@ -0,0 +1,15 @@
+{
+	"$id": "/inference/schemas/text-to-image/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Image task",
+	"title": "TextToImageOutput",
+	"type": "array",
+	"items": {
+		"properties": {
+			"image": {
+				"description": "The generated image"
+			}
+		},
+		"required": ["image"]
+	}
+}

From 499ed5f7f3f182e3178671e9327222ecdfdb60d4 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:44:42 +0100
Subject: [PATCH 41/51] Update image-to-image output

---
 packages/tasks/src/tasks/image-to-image/spec/output.json | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index 5e55f5677..af4eff804 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -3,8 +3,10 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
 	"title": "ImageToImageOutput",
-	"type": "array",
-	"items": {
-		"description": "The output image"
+	"type": "object",
+	"properties": {
+		"image": {
+			"description": "The output image"
+		}
 	}
 }

From bf48f5e3426852499d3ec746911ae6bc5e291210 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:51:34 +0100
Subject: [PATCH 42/51] Update image-to-image inputs

---
 .../src/tasks/image-to-image/inference.ts     | 51 +++++++++++++++++--
 .../src/tasks/image-to-image/spec/input.json  | 31 ++++++++++-
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index f05e24b6e..96a532b25 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -4,8 +4,6 @@
  * Using src/scripts/inference-codegen
  */
 
-export type ImageToImageOutput = unknown[];
-
 /**
  * Inputs for Image To Image inference
  */
@@ -17,6 +15,53 @@ export interface ImageToImageInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: unknown;
+	parameters?: ImageToImageParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Image
+ */
+export interface ImageToImageParameters {
+	/**
+	 * For diffusion models. A higher guidance scale value encourages the model to generate
+	 * images closely linked to the text prompt at the expense of lower image quality.
+	 */
+	guidanceScale?: number;
+	/**
+	 * One or several prompts to guide what NOT to include in image generation.
+	 */
+	negativePrompt?: string[];
+	/**
+	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+	 * a higher quality image at the expense of slower inference.
+	 */
+	numInferenceSteps?: number;
+	/**
+	 * The size in pixels of the output image
+	 */
+	targetSize?: TargetSize;
+	[property: string]: unknown;
+}
+
+/**
+ * The size in pixels of the output image
+ */
+export interface TargetSize {
+	height: number;
+	width: number;
+	[property: string]: unknown;
+}
+
+/**
+ * Outputs of inference for the Image To Image task
+ */
+export interface ImageToImageOutput {
+	/**
+	 * The output image
+	 */
+	image?: unknown;
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index d91d6e6d4..11d4bee8a 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -16,7 +16,36 @@
 	"$defs": {
 		"ImageToImageParameters": {
 			"title": "ImageToImageParameters",
-			"description": "Additional inference parameters for Image To Image"
+			"description": "Additional inference parameters for Image To Image",
+			"type": "object",
+			"properties": {
+				"guidanceScale": {
+					"type": "number",
+					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+				},
+				"negativePrompt": {
+					"type": "array",
+					"items": { "type": "string" },
+					"description": "One or several prompts to guide what NOT to include in image generation."
+				},
+				"numInferenceSteps": {
+					"type": "integer",
+					"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"targetSize": {
+					"type": "object",
+					"description": "The size in pixels of the output image",
+					"properties": {
+						"width": {
+							"type": "integer"
+						},
+						"height": {
+							"type": "integer"
+						}
+					},
+					"required": ["width", "height"]
+				}
+			}
 		}
 	},
 	"required": ["data"]

From 49a815101e22c4a8429934252cefe07e5517263a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 16:34:17 +0100
Subject: [PATCH 43/51] Factorize generate parameters

---
 .../tasks/src/scripts/inference-codegen.ts    | 10 +++---
 .../automatic-speech-recognition/inference.ts | 32 +++++++++++++++++--
 .../spec/input.json                           | 11 ++++++-
 .../src/tasks/image-to-text/inference.ts      | 16 ++++++++++
 .../src/tasks/image-to-text/spec/input.json   |  4 +++
 packages/tasks/src/tasks/schema-utils.json    | 18 +++++++++++
 .../src/tasks/text-to-audio/inference.ts      | 28 ++++++++++++++--
 .../src/tasks/text-to-audio/spec/input.json   |  7 +++-
 .../src/tasks/text-to-speech/inference.ts     | 28 +++++++++++++++-
 9 files changed, 141 insertions(+), 13 deletions(-)
 create mode 100644 packages/tasks/src/tasks/schema-utils.json

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index c66d87b25..ac72ff9f7 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -74,7 +74,6 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
  * And writes that to the `inference.ts` file
  *
  */
-
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
 	const source = ts.createSourceFile(
 		path.basename(path2generated),
@@ -149,9 +148,12 @@ async function main() {
 			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
-	const allSpecFiles = allTasks
-		.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
-		.filter((filepath) => pathExists(filepath));
+	const allSpecFiles = [
+		path.join(tasksDir, "schema-utils.json"),
+		...allTasks
+			.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
+			.filter((filepath) => pathExists(filepath)),
+	];
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index d83c45af5..244b44b69 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -14,9 +14,35 @@ export interface AutomaticSpeechRecognitionInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: {
-		[key: string]: unknown;
-	};
+	parameters?: AutomaticSpeechRecognitionParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Automatic Speech Recognition
+ */
+export interface AutomaticSpeechRecognitionParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	/**
+	 * Whether to output corresponding timestamps with the generated text
+	 */
+	returnTimestamps?: boolean;
+	[property: string]: unknown;
+}
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index f44075d56..93621151e 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -18,7 +18,16 @@
 			"title": "AutomaticSpeechRecognitionParameters",
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
-			"properties": {}
+			"properties": {
+				"returnTimestamps": {
+					"type": "boolean",
+					"description": "Whether to output corresponding timestamps with the generated text"
+				},
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+				}
+			}
 		}
 	},
 	"required": ["data"]
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index cba745139..c87a51ce3 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -23,12 +23,28 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
 	/**
 	 * The amount of maximum tokens to generate.
 	 */
 	maxNewTokens?: number;
 	[property: string]: unknown;
 }
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
+	[property: string]: unknown;
+}
 export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index f06eb59f0..b074372fc 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -22,6 +22,10 @@
 				"maxNewTokens": {
 					"type": "integer",
 					"description": "The amount of maximum tokens to generate."
+				},
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
new file mode 100644
index 000000000..60c833f60
--- /dev/null
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -0,0 +1,18 @@
+{
+	"$id": "/inference/schemas/schema-utils.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Common type definitions shared by several tasks",
+	"definitions": {
+		"GenerationParameters": {
+			"title": "GenerationParameters",
+			"description": "Ad-hoc parametrization of the text generation process",
+			"type": "object",
+			"properties": {
+				"temperature": {
+					"type": "number",
+					"description": "I can be the papa you'd be the mama"
+				}
+			}
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index d6a05e017..41796240a 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -14,9 +14,31 @@ export interface TextToAudioInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: {
-		[key: string]: unknown;
-	};
+	parameters?: TextToAudioParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Audio
+ */
+export interface TextToAudioParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	[property: string]: unknown;
+}
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 export type TextToAudioOutput = TextToAudioOutputElement[];
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 176060962..d049fb02e 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -19,7 +19,12 @@
 			"title": "TextToAudioParameters",
 			"description": "Additional inference parameters for Text To Audio",
 			"type": "object",
-			"properties": {}
+			"properties": {
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+				}
+			}
 		}
 	},
 	"required": ["data"]
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index f119bc62f..a89974072 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -17,7 +17,33 @@ export interface TextToSpeechInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: TextToAudioParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Audio
+ */
+export interface TextToAudioParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 

From e4f3d138493a59bb76a6d7371913c3768b48f1eb Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 16:42:14 +0100
Subject: [PATCH 44/51] Correctly type ASR output

---
 .../automatic-speech-recognition/inference.ts | 16 ++++++++++++++++
 .../spec/output.json                          | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 244b44b69..ee17e64f4 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -45,11 +45,27 @@ export interface GenerationParameters {
 	temperature?: number;
 	[property: string]: unknown;
 }
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * A chunk of text identified by the model
+	 */
+	text: string;
+	/**
+	 * The start and end timestamps corresponding with the text
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
 export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutputElement {
+	/**
+	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
+	 * the model.
+	 */
+	chunks?: AutomaticSpeechRecognitionOutputChunk[];
 	/**
 	 * The recognized text.
 	 */
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index 72573986d..217f210b1 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -10,6 +10,25 @@
 			"text": {
 				"type": "string",
 				"description": "The recognized text."
+			},
+			"chunks": {
+				"type": "array",
+				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+				"items": {
+					"type": "object",
+					"title": "AutomaticSpeechRecognitionOutputChunk",
+					"properties": {
+						"text": { "type": "string", "description": "A chunk of text identified by the model" },
+						"timestamps": {
+							"type": "array",
+							"description": "The start and end timestamps corresponding with the text",
+							"items": { "type": "number" },
+							"minItems": 2,
+							"maxItems": 2
+						}
+					},
+					"required": ["text", "timestamps"]
+				}
 			}
 		},
 		"required": ["text"]

From 826181a63831cbe92c97a4e23772fcb9d47967f6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:03:24 +0100
Subject: [PATCH 45/51] wip: spec generate parameters

---
 .../automatic-speech-recognition/inference.ts | 82 +++++++++++++++++-
 .../src/tasks/image-to-text/inference.ts      | 82 +++++++++++++++++-
 packages/tasks/src/tasks/schema-utils.json    | 64 +++++++++++++-
 .../src/tasks/text-to-audio/inference.ts      | 82 +++++++++++++++++-
 .../src/tasks/text-to-speech/inference.ts     | 83 ++++++++++++++++++-
 5 files changed, 387 insertions(+), 6 deletions(-)

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index ee17e64f4..d9e2adc85 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -40,11 +40,91 @@ export interface AutomaticSpeechRecognitionParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export interface AutomaticSpeechRecognitionOutputChunk {
 	/**
 	 * A chunk of text identified by the model
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index c87a51ce3..19bb147e2 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -40,11 +40,91 @@ export interface ImageToTextParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
index 60c833f60..5a3d3e812 100644
--- a/packages/tasks/src/tasks/schema-utils.json
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -1,7 +1,7 @@
 {
 	"$id": "/inference/schemas/schema-utils.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Common type definitions shared by several tasks",
+	"description": "(Incomplete!) Common type definitions shared by several tasks",
 	"definitions": {
 		"GenerationParameters": {
 			"title": "GenerationParameters",
@@ -10,7 +10,67 @@
 			"properties": {
 				"temperature": {
 					"type": "number",
-					"description": "I can be the papa you'd be the mama"
+					"description": "The value used to modulate the next token probabilities."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
+				},
+				"epsilonCutoff": {
+					"type": "number",
+					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"etaCutoff": {
+					"type": "number",
+					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"maxLength": {
+					"type": "integer",
+					"description": "The maximum length (in tokens) of the generated text, including the input."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
+				},
+				"minLength": {
+					"type": "integer",
+					"description": "The minimum length (in tokens) of the generated text, including the input."
+				},
+				"minNewTokens": {
+					"type": "integer",
+					"description": "The minimum number of tokens to generate. Takes precedence over minLength."
+				},
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
+				},
+				"earlyStopping": {
+					"description": "Controls the stopping condition for beam-based methods.",
+					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
+				},
+				"numBeams": {
+					"type": "integer",
+					"description": "Number of beams to use for beam search."
+				},
+				"numBeamGroups": {
+					"type": "integer",
+					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
+				},
+				"penaltyAlpha": {
+					"type": "number",
+					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
+				},
+				"useCache": {
+					"type": "boolean",
+					"description": "Whether the model should use the past last key/values attentions to speed up decoding"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 41796240a..14c484bf2 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -36,11 +36,91 @@ export interface TextToAudioParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export type TextToAudioOutput = TextToAudioOutputElement[];
 /**
  * Outputs of inference for the Text To Audio task
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index a89974072..f67e03652 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -41,12 +41,93 @@ export interface TextToAudioParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
 
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
+
 /**
  * Outputs for Text to Speech inference
  *

From 0000f02cfd68b86bd9865e396a3672a53a370225 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:15:11 +0100
Subject: [PATCH 46/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Factorize=20common?=
 =?UTF-8?q?=20classification=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/audio-classification/inference.ts   | 10 +++---
 .../audio-classification/spec/input.json      | 14 +--------
 .../tasks/image-classification/inference.ts   | 12 +++----
 .../image-classification/spec/input.json      | 14 +--------
 .../image-classification/spec/output.json     | 13 +-------
 packages/tasks/src/tasks/schema-utils.json    | 31 +++++++++++++++++++
 .../tasks/text-classification/inference.ts    | 12 +++----
 .../tasks/text-classification/spec/input.json | 14 +--------
 .../text-classification/spec/output.json      | 13 +-------
 .../tasks/video-classification/inference.ts   | 12 +++----
 .../video-classification/spec/input.json      | 14 +--------
 .../video-classification/spec/output.json     | 13 +-------
 .../zero-shot-classification/inference.ts     |  4 +--
 .../zero-shot-classification/spec/output.json | 13 +-------
 .../inference.ts                              |  4 +--
 .../spec/output.json                          | 13 +-------
 16 files changed, 67 insertions(+), 139 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 6671cdf14..ae37f29ac 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -23,17 +23,17 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: AudioClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type AudioClassificationOutput = AudioClassificationOutputElement[];
 /**
  * Outputs for Audio Classification inference
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 80e8651fe..4bf3639e6 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "AudioClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 5a43acdf5..7138a5073 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -23,24 +23,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: ImageClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type ImageClassificationOutput = ImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 1dee66b97..081c05a5f 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "ImageClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index a875898b6..b7b8ed424 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
index 5a3d3e812..49157797a 100644
--- a/packages/tasks/src/tasks/schema-utils.json
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -3,6 +3,37 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "(Incomplete!) Common type definitions shared by several tasks",
 	"definitions": {
+		"ClassificationOutputTransform": {
+			"title": "ClassificationOutputTransform",
+			"type": "string",
+			"description": "The function to apply to the model outputs in order to retrieve the scores.",
+			"oneOf": [
+				{
+					"const": "sigmoid"
+				},
+				{
+					"const": "softmax"
+				},
+				{
+					"const": "none"
+				}
+			]
+		},
+		"ClassificationOutput": {
+			"title": "ClassificationOutput",
+			"type": "object",
+			"properties": {
+				"label": {
+					"type": "string",
+					"description": "The predicted class label."
+				},
+				"score": {
+					"type": "number",
+					"description": "The corresponding probability."
+				}
+			},
+			"required": ["label", "score"]
+		},
 		"GenerationParameters": {
 			"title": "GenerationParameters",
 			"description": "Ad-hoc parametrization of the text generation process",
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 19298ccd0..9bc728a50 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -23,24 +23,24 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type TextClassificationOutput = TextClassificationOutputElement[];
 /**
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 08bac5953..87031422c 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -22,19 +22,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index b2b81acde..95d1ca5ee 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index ede6a25e4..1f765160f 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -27,10 +27,7 @@ export interface VideoClassificationParameters {
 	 * The sampling rate used to select frames from the video.
 	 */
 	frameSamplingRate?: number;
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * The number of sampled frames to consider for classification.
 	 */
@@ -41,14 +38,17 @@ export interface VideoClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type VideoClassificationOutput = VideoClassificationOutputElement[];
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 386992c9a..0607bbf52 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"numFrames": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 9220cdbae..247aae997 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index db7f0c8bb..e0b43ec70 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -56,11 +56,11 @@ export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[]
  */
 export interface ZeroShotClassificationOutputElement {
 	/**
-	 * A candidate label
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
-	 * The associated score / probability
+	 * The corresponding probability.
 	 */
 	score: number;
 	[property: string]: unknown;
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 27ad4b00e..d9e78c231 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "A candidate label"
-			},
-			"score": {
-				"type": "number",
-				"description": "The associated score / probability"
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 22308aabb..2bea5436b 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -50,11 +50,11 @@ export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutpu
  */
 export interface ZeroShotImageClassificationOutputElement {
 	/**
-	 * A candidate label
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
-	 * The associated score / probability
+	 * The corresponding probability.
 	 */
 	score: number;
 	[property: string]: unknown;
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 2b0e78b84..68a5ecfb0 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "A candidate label"
-			},
-			"score": {
-				"type": "number",
-				"description": "The associated score / probability"
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }

From 8dc4d172d21d77023a37a5627b273982c3adcdc2 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:17:35 +0100
Subject: [PATCH 47/51] fix: await writefile in post process

---
 .../tasks/src/scripts/inference-codegen.ts    |  2 +-
 .../zero-shot-object-detection/inference.ts   | 66 +++++++++++++++++++
 .../spec/output.json                          |  1 +
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index ac72ff9f7..8ad7fc3e0 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -131,7 +131,7 @@ async function postProcessOutput(path2generated: string, outputSpec: Record<stri
 		updatedInterface,
 	]);
 
-	fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), {
+	await fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), {
 		flag: "w+",
 		encoding: "utf-8",
 	});
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index e69de29bb..edb51172e 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -0,0 +1,66 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Zero Shot Object Detection inference
+ */
+export interface ZeroShotObjectDetectionInput {
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotObjectDetectionInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: {
+		[key: string]: unknown;
+	};
+	[property: string]: unknown;
+}
+/**
+ * The input image data, with candidate labels
+ */
+export interface ZeroShotObjectDetectionInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
+}
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
+}
+export type ZeroShotObjectDetectionOutput = ZeroShotObjectDetectionOutputElement[];
+/**
+ * Outputs of inference for the Zero Shot Object Detection task
+ */
+export interface ZeroShotObjectDetectionOutputElement {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 6293efc97..8afa60527 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -6,6 +6,7 @@
 	"type": "array",
 	"items": {
 		"type": "object",
+		"title": "ZeroShotObjectDetectionOutputElement",
 		"properties": {
 			"label": {
 				"type": "string",

From 9ccb3a44dbb2234d747b7cc067c772a729794f2d Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:19:40 +0100
Subject: [PATCH 48/51] add scheduler param

---
 packages/tasks/src/tasks/text-to-image/spec/input.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/packages/tasks/src/tasks/text-to-image/spec/input.json b/packages/tasks/src/tasks/text-to-image/spec/input.json
index 32a076dd0..cb1e1c6cf 100644
--- a/packages/tasks/src/tasks/text-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-image/spec/input.json
@@ -45,6 +45,10 @@
 						}
 					},
 					"required": ["width", "height"]
+				},
+				"scheduler": {
+					"type": "string",
+					"description": "For diffusion models. Override the scheduler with a compatible one"
 				}
 			}
 		}

From accdeffba68542882b8f19921dd1c30a5ee25c16 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:21:59 +0100
Subject: [PATCH 49/51] rename schema-utils to common-definitions

---
 .../tasks/src/scripts/inference-codegen.ts    |   2 +-
 .../audio-classification/spec/input.json      |   2 +-
 .../spec/input.json                           |   2 +-
 .../image-classification/spec/input.json      |   2 +-
 .../image-classification/spec/output.json     |   2 +-
 .../src/tasks/image-to-text/spec/input.json   |   2 +-
 packages/tasks/src/tasks/schema-utils.json    | 109 ------------------
 .../tasks/text-classification/spec/input.json |   2 +-
 .../text-classification/spec/output.json      |   2 +-
 .../src/tasks/text-to-audio/spec/input.json   |   2 +-
 .../src/tasks/text-to-image/inference.ts      |   4 +
 .../video-classification/spec/input.json      |   2 +-
 .../video-classification/spec/output.json     |   2 +-
 .../zero-shot-classification/spec/output.json |   2 +-
 .../spec/output.json                          |   2 +-
 15 files changed, 17 insertions(+), 122 deletions(-)
 delete mode 100644 packages/tasks/src/tasks/schema-utils.json

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 8ad7fc3e0..02c8e3003 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -149,7 +149,7 @@ async function main() {
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
 	const allSpecFiles = [
-		path.join(tasksDir, "schema-utils.json"),
+		path.join(tasksDir, "common-definitions.json"),
 		...allTasks
 			.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
 			.filter((filepath) => pathExists(filepath)),
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 4bf3639e6..cfd5a54a6 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "AudioClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index 93621151e..2d31957ed 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -25,7 +25,7 @@
 				},
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 081c05a5f..362c0d517 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "ImageClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index b7b8ed424..2a3264bce 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index b074372fc..0ef8ba1dc 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -25,7 +25,7 @@
 				},
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
deleted file mode 100644
index 49157797a..000000000
--- a/packages/tasks/src/tasks/schema-utils.json
+++ /dev/null
@@ -1,109 +0,0 @@
-{
-	"$id": "/inference/schemas/schema-utils.json",
-	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "(Incomplete!) Common type definitions shared by several tasks",
-	"definitions": {
-		"ClassificationOutputTransform": {
-			"title": "ClassificationOutputTransform",
-			"type": "string",
-			"description": "The function to apply to the model outputs in order to retrieve the scores.",
-			"oneOf": [
-				{
-					"const": "sigmoid"
-				},
-				{
-					"const": "softmax"
-				},
-				{
-					"const": "none"
-				}
-			]
-		},
-		"ClassificationOutput": {
-			"title": "ClassificationOutput",
-			"type": "object",
-			"properties": {
-				"label": {
-					"type": "string",
-					"description": "The predicted class label."
-				},
-				"score": {
-					"type": "number",
-					"description": "The corresponding probability."
-				}
-			},
-			"required": ["label", "score"]
-		},
-		"GenerationParameters": {
-			"title": "GenerationParameters",
-			"description": "Ad-hoc parametrization of the text generation process",
-			"type": "object",
-			"properties": {
-				"temperature": {
-					"type": "number",
-					"description": "The value used to modulate the next token probabilities."
-				},
-				"topK": {
-					"type": "integer",
-					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
-				},
-				"topP": {
-					"type": "number",
-					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
-				},
-				"typicalP": {
-					"type": "number",
-					"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
-				},
-				"epsilonCutoff": {
-					"type": "number",
-					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
-				},
-				"etaCutoff": {
-					"type": "number",
-					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
-				},
-				"maxLength": {
-					"type": "integer",
-					"description": "The maximum length (in tokens) of the generated text, including the input."
-				},
-				"maxNewTokens": {
-					"type": "integer",
-					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
-				},
-				"minLength": {
-					"type": "integer",
-					"description": "The minimum length (in tokens) of the generated text, including the input."
-				},
-				"minNewTokens": {
-					"type": "integer",
-					"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
-				},
-				"doSample": {
-					"type": "boolean",
-					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
-				},
-				"earlyStopping": {
-					"description": "Controls the stopping condition for beam-based methods.",
-					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
-				},
-				"numBeams": {
-					"type": "integer",
-					"description": "Number of beams to use for beam search."
-				},
-				"numBeamGroups": {
-					"type": "integer",
-					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
-				},
-				"penaltyAlpha": {
-					"type": "number",
-					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
-				},
-				"useCache": {
-					"type": "boolean",
-					"description": "Whether the model should use the past last key/values attentions to speed up decoding"
-				}
-			}
-		}
-	}
-}
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 87031422c..6ae6f1c39 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -22,7 +22,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 95d1ca5ee..704b82225 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index d049fb02e..95bd8d16d 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -22,7 +22,7 @@
 			"properties": {
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts
index bfe0433f3..c25031b29 100644
--- a/packages/tasks/src/tasks/text-to-image/inference.ts
+++ b/packages/tasks/src/tasks/text-to-image/inference.ts
@@ -39,6 +39,10 @@ export interface TextToImageParameters {
 	 * a higher quality image at the expense of slower inference.
 	 */
 	numInferenceSteps?: number;
+	/**
+	 * For diffusion models. Override the scheduler with a compatible one
+	 */
+	scheduler?: string;
 	/**
 	 * The size in pixel of the output image
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 0607bbf52..984670953 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"numFrames": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 247aae997..4c24f5d57 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index d9e78c231..83ed1098f 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 68a5ecfb0..6b795fbdb 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }

From 3a3d4ba6318d947e40d73de3469f0e27dafde1e6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:24:21 +0100
Subject: [PATCH 50/51] proper type for table QA

---
 packages/tasks/src/tasks/table-question-answering/inference.ts | 2 +-
 .../tasks/src/tasks/table-question-answering/spec/input.json   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index ac04c6a32..fe06dbbfe 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -31,7 +31,7 @@ export interface TableQuestionAnsweringInputData {
 	 * The table to serve as context for the questions
 	 */
 	table: {
-		[key: string]: unknown;
+		[key: string]: string[];
 	};
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index e3fc6db9f..6309cf1f3 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -12,7 +12,8 @@
 			"properties": {
 				"table": {
 					"description": "The table to serve as context for the questions",
-					"type": "object"
+					"type": "object",
+					"additionalProperties": { "type": "array", "items": { "type": "string" } }
 				},
 				"question": {
 					"description": "The question to be answered about the table",

From 4742c9ed37c44bf413f43474e8804e4ea44417f1 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:28:13 +0100
Subject: [PATCH 51/51] oops I forgot to commit the new file after rename

---
 .../tasks/src/tasks/common-definitions.json   | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 packages/tasks/src/tasks/common-definitions.json

diff --git a/packages/tasks/src/tasks/common-definitions.json b/packages/tasks/src/tasks/common-definitions.json
new file mode 100644
index 000000000..6e0ec532d
--- /dev/null
+++ b/packages/tasks/src/tasks/common-definitions.json
@@ -0,0 +1,109 @@
+{
+	"$id": "/inference/schemas/common-definitions.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "(Incomplete!) Common type definitions shared by several tasks",
+	"definitions": {
+		"ClassificationOutputTransform": {
+			"title": "ClassificationOutputTransform",
+			"type": "string",
+			"description": "The function to apply to the model outputs in order to retrieve the scores.",
+			"oneOf": [
+				{
+					"const": "sigmoid"
+				},
+				{
+					"const": "softmax"
+				},
+				{
+					"const": "none"
+				}
+			]
+		},
+		"ClassificationOutput": {
+			"title": "ClassificationOutput",
+			"type": "object",
+			"properties": {
+				"label": {
+					"type": "string",
+					"description": "The predicted class label."
+				},
+				"score": {
+					"type": "number",
+					"description": "The corresponding probability."
+				}
+			},
+			"required": ["label", "score"]
+		},
+		"GenerationParameters": {
+			"title": "GenerationParameters",
+			"description": "Ad-hoc parametrization of the text generation process",
+			"type": "object",
+			"properties": {
+				"temperature": {
+					"type": "number",
+					"description": "The value used to modulate the next token probabilities."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
+				},
+				"epsilonCutoff": {
+					"type": "number",
+					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"etaCutoff": {
+					"type": "number",
+					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"maxLength": {
+					"type": "integer",
+					"description": "The maximum length (in tokens) of the generated text, including the input."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
+				},
+				"minLength": {
+					"type": "integer",
+					"description": "The minimum length (in tokens) of the generated text, including the input."
+				},
+				"minNewTokens": {
+					"type": "integer",
+					"description": "The minimum number of tokens to generate. Takes precedence over minLength."
+				},
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
+				},
+				"earlyStopping": {
+					"description": "Controls the stopping condition for beam-based methods.",
+					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
+				},
+				"numBeams": {
+					"type": "integer",
+					"description": "Number of beams to use for beam search."
+				},
+				"numBeamGroups": {
+					"type": "integer",
+					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
+				},
+				"penaltyAlpha": {
+					"type": "number",
+					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
+				},
+				"useCache": {
+					"type": "boolean",
+					"description": "Whether the model should reuse the key/value attention states cached from previous decoding steps to speed up decoding."
+				}
+			}
+		}
+	}
+}