Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SmolVLM model #73

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions nbs/smolvlm.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import xinfer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> Available Models </span>\n",
"┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Implementation </span>┃<span style=\"font-weight: bold\"> Model ID </span>┃<span style=\"font-weight: bold\"> Input --&gt; Output </span>┃\n",
"┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> transformers </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> HuggingFaceTB/SmolVLM-Instruct </span>│<span style=\"color: #008000; text-decoration-color: #008000\"> image-text --&gt; text </span>│\n",
"└────────────────┴────────────────────────────────┴─────────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m Available Models \u001b[0m\n",
"┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mImplementation\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mModel ID \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mInput --> Output \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n",
"│\u001b[36m \u001b[0m\u001b[36mtransformers \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mHuggingFaceTB/SmolVLM-Instruct\u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32mimage-text --> text\u001b[0m\u001b[32m \u001b[0m│\n",
"└────────────────┴────────────────────────────────┴─────────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"xinfer.list_models(\"smol\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m63\u001b[0m - \u001b[1mModel: HuggingFaceTB/SmolVLM-Instruct\u001b[0m\n",
"\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n",
"\u001b[32m2024-11-27 13:31:49.435\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m65\u001b[0m - \u001b[1mDtype: bfloat16\u001b[0m\n"
]
}
],
"source": [
"model = xinfer.create_model(\n",
" \"HuggingFaceTB/SmolVLM-Instruct\", device=\"cuda\", dtype=\"bfloat16\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The image depicts a street scene with a parade taking place. The street is lined with buildings, some of which are shops, and there are people standing on both sides of the street, watching the parade. The parade consists of a number of vehicles, including a truck, and a float with a large American flag prominently displayed. The truck is carrying a large number of flags, including the American flag, and there are also smaller flags on the float. The people on the street are dressed in a variety of ways, including some wearing winter coats and hats, indicating that the weather is likely cold.\\n\\nIn the background, there is a clock tower on a building, and a mountain can be seen in the distance. The sky is overcast, and the weather appears to be cloudy and possibly rainy. The people on the street are standing in various poses, some with their hands raised in the air, indicating excitement or participation in the parade.\\n\\nThe image captures a moment of community celebration and togetherness, with people from all walks of life coming together to enjoy a parade. The parade is likely an annual event, given the presence of the clock tower and the presence of multiple buildings, suggesting that it takes place in a town or city.\\n\\n### Analysis and Description:\\n1. **Parade Participants**:\\n - The parade includes a truck and a float with an American flag prominently displayed.\\n - The truck is carrying a large number of flags, including the American flag.'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"image = \"https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/00aa2580828a9009.jpg\"\n",
"prompt = \"Describe this image.\"\n",
"\n",
"model.infer(image, prompt).text"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> Model Info </span>\n",
"╭───────────────────────────┬────────────────────────────────╮\n",
"│<span style=\"font-weight: bold\"> Attribute </span>│<span style=\"font-weight: bold\"> Value </span>│\n",
"├───────────────────────────┼────────────────────────────────┤\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Model ID </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> HuggingFaceTB/SmolVLM-Instruct </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Device </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> cuda </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Dtype </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> torch.bfloat16 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Number of Inferences </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 1 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Total Inference Time (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 5042.1868 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Average Latency (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 5042.1868 </span>│\n",
"╰───────────────────────────┴────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m Model Info \u001b[0m\n",
"╭───────────────────────────┬────────────────────────────────╮\n",
"│\u001b[1m \u001b[0m\u001b[1mAttribute \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0m│\n",
"├───────────────────────────┼────────────────────────────────┤\n",
"│\u001b[36m \u001b[0m\u001b[36mModel ID \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mHuggingFaceTB/SmolVLM-Instruct\u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDevice \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mcuda \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDtype \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mtorch.bfloat16 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mNumber of Inferences \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mTotal Inference Time (ms)\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mAverage Latency (ms) \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n",
"╰───────────────────────────┴────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model.print_stats()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "xinfer",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions xinfer/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .llama32 import Llama32Vision, Llama32VisionInstruct
from .moondream import Moondream
from .qwen2_vl import Qwen2VL
from .smolvlm import SmolVLM
from .vision2seq import Vision2SeqModel
from .vlrm_blip2 import VLRMBlip2

Expand Down
48 changes: 48 additions & 0 deletions xinfer/transformers/smolvlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from ..model_registry import register_model
from ..models import track_inference
from ..types import ModelInputOutput, Result
from .vision2seq import Vision2SeqModel


@register_model(
    "HuggingFaceTB/SmolVLM-Instruct",
    "transformers",
    ModelInputOutput.IMAGE_TEXT_TO_TEXT,
)
class SmolVLM(Vision2SeqModel):
    """SmolVLM image-text-to-text model on top of the Vision2Seq transformers backend.

    Registered under the ``transformers`` implementation as
    ``HuggingFaceTB/SmolVLM-Instruct`` with image-text --> text I/O.
    """

    def __init__(self, model_id: str, device: str = "cpu", dtype: str = "float32"):
        # All loading (processor + model) is handled by the Vision2Seq base class.
        super().__init__(model_id, device, dtype)

    @track_inference
    def infer(self, image: str, text: str, **generate_kwargs) -> Result:
        """Run single-image chat inference and return the assistant's reply.

        Args:
            image: Image source (path or URL) accepted by the processor.
            text: User prompt to pair with the image.
            **generate_kwargs: Extra arguments forwarded to ``model.generate``.

        Returns:
            Result: wraps the decoded assistant response text.
        """
        # Sensible generation length by default; callers may override it.
        generate_kwargs.setdefault("max_new_tokens", 300)

        chat = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": text},
                ],
            }
        ]

        rendered = self.processor.apply_chat_template(
            chat, add_generation_prompt=True
        )
        model_inputs = self.processor(
            text=rendered, images=image, return_tensors="pt"
        ).to(self.device)

        output_ids = self.model.generate(**model_inputs, **generate_kwargs)
        decoded = self.processor.batch_decode(
            output_ids,
            skip_special_tokens=True,
        )

        # The decoded text contains the whole transcript (prompt included);
        # keep only what follows the assistant marker.
        answer = decoded[0].split("Assistant:", 1)[-1].strip()
        return Result(text=answer)

    def infer_batch(self, *args, **kwargs):
        """Batch inference is intentionally unsupported for this model."""
        raise NotImplementedError("SmolVLM does not support batch inference.")
Loading