Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SmolVLM model #73

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions nbs/smolvlm.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import xinfer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> Available Models </span>\n",
"┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Implementation </span>┃<span style=\"font-weight: bold\"> Model ID </span>┃<span style=\"font-weight: bold\"> Input --&gt; Output </span>┃\n",
"┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> transformers </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> HuggingFaceTB/SmolVLM-Instruct </span>│<span style=\"color: #008000; text-decoration-color: #008000\"> image-text --&gt; text </span>│\n",
"└────────────────┴────────────────────────────────┴─────────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m Available Models \u001b[0m\n",
"┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mImplementation\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mModel ID \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mInput --> Output \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n",
"│\u001b[36m \u001b[0m\u001b[36mtransformers \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mHuggingFaceTB/SmolVLM-Instruct\u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32mimage-text --> text\u001b[0m\u001b[32m \u001b[0m│\n",
"└────────────────┴────────────────────────────────┴─────────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"xinfer.list_models(\"smol\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m63\u001b[0m - \u001b[1mModel: HuggingFaceTB/SmolVLM-Instruct\u001b[0m\n",
"\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n",
"\u001b[32m2024-11-27 13:31:49.435\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m65\u001b[0m - \u001b[1mDtype: bfloat16\u001b[0m\n"
]
}
],
"source": [
"model = xinfer.create_model(\n",
" \"HuggingFaceTB/SmolVLM-Instruct\", device=\"cuda\", dtype=\"bfloat16\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The image depicts a street scene with a parade taking place. The street is lined with buildings, some of which are shops, and there are people standing on both sides of the street, watching the parade. The parade consists of a number of vehicles, including a truck, and a float with a large American flag prominently displayed. The truck is carrying a large number of flags, including the American flag, and there are also smaller flags on the float. The people on the street are dressed in a variety of ways, including some wearing winter coats and hats, indicating that the weather is likely cold.\\n\\nIn the background, there is a clock tower on a building, and a mountain can be seen in the distance. The sky is overcast, and the weather appears to be cloudy and possibly rainy. The people on the street are standing in various poses, some with their hands raised in the air, indicating excitement or participation in the parade.\\n\\nThe image captures a moment of community celebration and togetherness, with people from all walks of life coming together to enjoy a parade. The parade is likely an annual event, given the presence of the clock tower and the presence of multiple buildings, suggesting that it takes place in a town or city.\\n\\n### Analysis and Description:\\n1. **Parade Participants**:\\n - The parade includes a truck and a float with an American flag prominently displayed.\\n - The truck is carrying a large number of flags, including the American flag.'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"image = \"https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/00aa2580828a9009.jpg\"\n",
"prompt = \"Describe this image.\"\n",
"\n",
"model.infer(image, prompt).text"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> Model Info </span>\n",
"╭───────────────────────────┬────────────────────────────────╮\n",
"│<span style=\"font-weight: bold\"> Attribute </span>│<span style=\"font-weight: bold\"> Value </span>│\n",
"├───────────────────────────┼────────────────────────────────┤\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Model ID </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> HuggingFaceTB/SmolVLM-Instruct </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Device </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> cuda </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Dtype </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> torch.bfloat16 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Number of Inferences </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 1 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Total Inference Time (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 5042.1868 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Average Latency (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 5042.1868 </span>│\n",
"╰───────────────────────────┴────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m Model Info \u001b[0m\n",
"╭───────────────────────────┬────────────────────────────────╮\n",
"│\u001b[1m \u001b[0m\u001b[1mAttribute \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0m│\n",
"├───────────────────────────┼────────────────────────────────┤\n",
"│\u001b[36m \u001b[0m\u001b[36mModel ID \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mHuggingFaceTB/SmolVLM-Instruct\u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDevice \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mcuda \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDtype \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mtorch.bfloat16 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mNumber of Inferences \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mTotal Inference Time (ms)\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mAverage Latency (ms) \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n",
"╰───────────────────────────┴────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model.print_stats()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "xinfer",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions xinfer/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .llama32 import Llama32Vision, Llama32VisionInstruct
from .moondream import Moondream
from .qwen2_vl import Qwen2VL
from .smolvlm import SmolVLM
from .vision2seq import Vision2SeqModel
from .vlrm_blip2 import VLRMBlip2

Expand Down
48 changes: 48 additions & 0 deletions xinfer/transformers/smolvlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from ..model_registry import register_model
from ..models import track_inference
from ..types import ModelInputOutput, Result
from .vision2seq import Vision2SeqModel


@register_model(
    "HuggingFaceTB/SmolVLM-Instruct",
    "transformers",
    ModelInputOutput.IMAGE_TEXT_TO_TEXT,
)
class SmolVLM(Vision2SeqModel):
    """SmolVLM image-text-to-text model on top of the Vision2Seq transformers backend.

    Registered under the ``transformers`` implementation as
    ``HuggingFaceTB/SmolVLM-Instruct`` with image-text --> text I/O.
    """

    def __init__(self, model_id: str, device: str = "cpu", dtype: str = "float32"):
        # All loading (processor + model) is handled by the Vision2Seq base class.
        super().__init__(model_id, device, dtype)

    @track_inference
    def infer(self, image: str, text: str, **generate_kwargs) -> Result:
        """Run single-image chat inference and return the assistant's reply.

        Args:
            image: Image source (path or URL) accepted by the processor.
            text: User prompt to pair with the image.
            **generate_kwargs: Extra arguments forwarded to ``model.generate``.

        Returns:
            Result: wraps the decoded assistant response text.
        """
        # Sensible generation length by default; callers may override it.
        generate_kwargs.setdefault("max_new_tokens", 300)

        chat = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": text},
                ],
            }
        ]

        rendered = self.processor.apply_chat_template(
            chat, add_generation_prompt=True
        )
        model_inputs = self.processor(
            text=rendered, images=image, return_tensors="pt"
        ).to(self.device)

        output_ids = self.model.generate(**model_inputs, **generate_kwargs)
        decoded = self.processor.batch_decode(
            output_ids,
            skip_special_tokens=True,
        )

        # The decoded text contains the whole transcript (prompt included);
        # keep only what follows the assistant marker.
        answer = decoded[0].split("Assistant:", 1)[-1].strip()
        return Result(text=answer)

    def infer_batch(self, *args, **kwargs):
        """Batch inference is intentionally unsupported for this model."""
        raise NotImplementedError("SmolVLM does not support batch inference.")
Loading