jsk-ros-pkg · Kanazawanaoaki · Oct 12, 2023 · Oct 15, 2023
diff --git a/doc/jsk_perception/nodes/detection_node.md b/doc/jsk_perception/nodes/detection_node.md
@@ -0,0 +1,89 @@
+# detection_node.py
+
+![](images/dino.png)
+
+The ROS node for Open-Vocabulary Object Detection with GroundingDINO.
+
+## System Configuration
+![](images/large_scale_vil_system.png)
+
+This node works together with a Docker container that performs the inference. Please build the container first, following the setup instructions below.
+
+### Prerequisite
+This node requires an NVIDIA GPU with more than 4GB of GPU memory to work properly.
+You have to install nvidia-container-toolkit to use the GPU with Docker. Please follow the [official instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+
+### Build the docker image
+You have to build the Docker image for GroundingDINO:
+
+```shell
+roscd jsk_perception/docker
+make
+```
+
+## Subscribing topic
+* `~image` (`sensor_msgs/Image`)
+
+  Input image
+
+## Publishing topic
+* `~output/image` (`sensor_msgs/Image`)
+
+  Image with the detected bounding boxes drawn on it
+
+* `~rects` (`jsk_recognition_msgs/RectArray`)
+
+  Array of detected bounding box regions
+
+* `~result` (`jsk_recognition_msgs/DetectionResult`)
+
+  Detection result
+
+* `~result/image` (`sensor_msgs/Image`)
+
+  Images used for inference
+
+* `~visualize` (`std_msgs/String`)
+
+  Detection result to visualize
+
+## Action topic
+* `~inference_server/goal` (`jsk_recognition_msgs/DetectionTaskActionGoal`) 
+
+  Detection request with custom categories and image
+
+* `~inference_server/result` (`jsk_recognition_msgs/DetectionTaskActionResult`)
+
+  Detection result of `~inference_server/goal`
+
+## Parameters
+* `~host` (String, default: `localhost`)
+
+  The host name or IP address of the inference container
+
+* `~port` (Integer, default: `8080`)
+
+  The HTTP port of inference container
+
+## Dynamic Reconfigure Parameters
+* `~queries` (string, default: `human;kettle;cup;glass`) 
+
+  Default categories (separated by `;`) to detect in images received on the subscribed image topic.
+
+### Run inference container on another host or another terminal
+In the remote GPU machine,
+```shell
+cd jsk_recognition/jsk_perception/docker
+./run_jsk_vil_api dino --port (Your vacant port)
+```
+
+In the ROS machine,
+```shell
+roslaunch jsk_perception detection.launch port:=(Your inference container port) host:=(Your inference container host) DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true 
+```
+
+
+### Run both the inference container and the ROS node on a single host
+```
+roslaunch jsk_perception detection.launch run_api:=true DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true 
+```
diff --git a/doc/jsk_perception/nodes/images/dino.png b/doc/jsk_perception/nodes/images/dino.png
diff --git a/jsk_perception/docker/Makefile b/jsk_perception/docker/Makefile
@@ -5,9 +5,11 @@
 # api directories
 OFAPROJECT = ofa
 CLIPPROJECT = clip
+DINOPROJECT = dino
 # image names
 OFAIMAGE = jsk-ofa-server
 CLIPIMAGE = jsk-clip-server
+DINOIMAGE = jsk-dino-server
 # commands
 BUILDIMAGE = docker build
 REMOVEIMAGE = docker rmi
@@ -23,7 +25,7 @@ PARAMURLS = parameter_urls.txt
 # OFA parameters
 OFAPARAMFILES = $(foreach param, $(OFAPARAMS), $(PARAMDIR)/$(param))
 
-all: ofa clip
+all: ofa clip dino
 
 # TODO check command wget exists, nvidia-driver version
 
@@ -41,11 +43,14 @@ ofa: $(PARAMDIR)/.download
 clip: $(PARAMDIR)/.download
 	$(BUILDIMAGE) $(CLIPPROJECT) -t $(CLIPIMAGE) -f $(CLIPPROJECT)/Dockerfile
 
+dino: $(PARAMDIR)/.download  # build the $(DINOIMAGE) image from $(DINOPROJECT)/Dockerfile; same prerequisite as the ofa and clip targets
+	$(BUILDIMAGE) $(DINOPROJECT) -t $(DINOIMAGE) -f $(DINOPROJECT)/Dockerfile
+
 # TODO add clip, glip
 clean:
 	@$(REMOVEIMAGE) $(OFAIMAGE)
 
 wipe: clean
 	rm -fr $(PARAMDIR)
 
-.PHONY: clean wipe ofa clip
+.PHONY: clean wipe ofa clip dino
diff --git a/jsk_perception/docker/dino/Dockerfile b/jsk_perception/docker/dino/Dockerfile
@@ -0,0 +1,27 @@
+# Alternative base images tried: pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel, pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
+FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-devel
+# Keep apt from prompting during the image build (ARG, so it does not persist into the running container)
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get -o Acquire::AllowInsecureRepositories=true update \
+    && apt-get install -y \
+    curl \
+    git \
+    libopencv-dev \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+ENV CUDA_HOME=/usr/local/cuda
+ENV TORCH_CUDA_ARCH_LIST=8.0+PTX
+RUN git clone https://github.com/IDEA-Research/GroundingDINO.git
+# Mirror the build variables into ~/.bashrc for interactive shells; both need `export` to reach child processes
+RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && echo 'export TORCH_CUDA_ARCH_LIST=8.0+PTX' >> ~/.bashrc
+RUN pip install flask opencv-python \
+    && pip install "numpy>=1.20"
+RUN cd GroundingDINO \
+    && pip install -r requirements.txt \
+    && pip install -e .
+RUN mkdir -p GroundingDINO/weights \
+    && cd GroundingDINO/weights \
+    && wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
+COPY server.py /workspace/GroundingDINO
+ENTRYPOINT cd /workspace/GroundingDINO && python server.py
diff --git a/jsk_perception/docker/dino/server.py b/jsk_perception/docker/dino/server.py
@@ -0,0 +1,99 @@
+from groundingdino.util.inference import load_model, load_image, predict, annotate
+import groundingdino.datasets.transforms as T
+from torchvision.ops import box_convert
+
+import cv2
+import numpy as np
+from PIL import Image as PLImage
+import torch
+
+# web server
+from flask import Flask, request, Response
+import json
+import base64
+
+
+def apply_half(t):  # helper: cast a float32 tensor to half precision (not called anywhere in this file)
+    if t.dtype is torch.float32:
+        return t.to(dtype=torch.half)
+    return t  # any other dtype is returned unchanged
+
+class Inference:  # wraps a GroundingDINO model and runs text-prompted detection on one image at a time
+    def __init__(self, gpu_id=None):
+        self.gpu_id = gpu_id  # stored only; nothing in this class reads it
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")  # paths are relative to the working directory
+        self.BOX_TRESHOLD = 0.35  # passed to predict() as box_threshold
+        self.TEXT_TRESHOLD = 0.25  # passed to predict() as text_threshold
+
+    def convert_to_string(self, input_list):  # e.g. ["cup", "kettle"] -> "cup . kettle ."
+        output_string = ""
+        for item in input_list:
+            output_string += item + " . "
+        return output_string.strip()  # drops the trailing space left by the last separator
+
+    def infer(self, img, texts):  # img: BGR image array (converted to RGB below); texts: list of query strings
+        # Convert the OpenCV (BGR) image to RGB before handing it to the model
+        # image = cv2.resize(img, dsize=(640, 480)) # NOTE: forced resize, currently disabled
+        # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        image_source = PLImage.fromarray(image)  # PIL image fed to the transform pipeline
+        image = np.asarray(image_source)  # numpy view of the same RGB image
+        transform = T.Compose(
+            [
+                T.RandomResize([800], max_size=1333),  # only one size is listed, so the choice is not random
+                T.ToTensor(),
+                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+            ]
+        )
+        image_transformed, _ = transform(image_source, None)  # second argument (target) is unused here
+
+        image_source = image  # RGB numpy array; its shape is used below to scale the boxes
+        image = image_transformed  # tensor actually passed to the model
+
+        TEXT_PROMPT = self.convert_to_string(texts)
+
+        boxes, logits, phrases = predict(
+            model=self.model,
+            image=image,
+            caption=TEXT_PROMPT,
+            box_threshold=self.BOX_TRESHOLD,
+            text_threshold=self.TEXT_TRESHOLD,
+            device = self.device
+        )
+
+        h, w, _ = image_source.shape
+        boxes = boxes * torch.Tensor([w, h, w, h])  # presumably predict() returns normalized boxes; scale to pixels - TODO confirm
+        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()  # center/size -> corner coordinates
+
+        results = {}  # maps detection index -> {"box", "logit", "phrase"}
+        for i in range(len(xyxy)):
+            box = xyxy[i].tolist()
+            logit = logits[i].item()
+            results[i] = {"box": box, "logit": logit, "phrase": phrases[i]}
+
+        return results
+
+# run
+if __name__ == "__main__":
+    app = Flask(__name__)
+    infer = Inference()
+
+    @app.route("/detection", methods=['POST'])
+    def detection_request():
+        data = request.data.decode("utf-8")
+        data_json = json.loads(data)
+        # process image
+        image_b = data_json['image']
+        image_dec = base64.b64decode(image_b)
+        data_np = np.fromstring(image_dec, dtype='uint8')
+        img = cv2.imdecode(data_np, 1)
+        # get text
+        texts = data_json['queries']
+        infer_results = infer.infer(img, texts)
+        results = []
+        for i in range(len(infer_results)):
+            results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]})
+        return Response(response=json.dumps({"results": results}), status=200)
+
+    app.run("0.0.0.0", 8080, threaded=True)
diff --git a/jsk_perception/docker/run_jsk_vil_api b/jsk_perception/docker/run_jsk_vil_api
@@ -10,7 +10,8 @@ import subprocess
 import sys
 
 CONTAINERS = {"ofa": "jsk-ofa-server",
-              "clip": "jsk-clip-server"}
+              "clip": "jsk-clip-server",
+              "dino": "jsk-dino-server"}
 OFA_MODEL_SCALES = ["base", "large", "huge"]
 
 parser = argparse.ArgumentParser(description="JSK Vision and Language API runner")

diff --git a/jsk_perception/launch/detection.launch b/jsk_perception/launch/detection.launch
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<launch>
+  <arg name="host" default="localhost" />
+  <arg name="port" default="8888" />
+  <arg name="gui" default="false" />
+  <arg name="run_api" default="false" />
+  <arg name="model" default="dino" />
+  <arg name="DETECTION_INPUT_IMAGE" default="image" />
+
+  <node name="detection_api" pkg="jsk_perception" type="run_jsk_vil_api" output="log"
+        args="$(arg model) -p $(arg port)" if="$(arg run_api)" />
+
+  <node name="detection" pkg="jsk_perception" type="detection_node.py" output="screen">
+    <remap from="~image" to="$(arg DETECTION_INPUT_IMAGE)" />
+    <rosparam subst_value="true">
+      host: $(arg host)
+      port: $(arg port)
+      model: $(arg model)
+    </rosparam>
+  </node>
+
+  <include file="$(find jsk_perception)/launch/ofa_gui.launch" if="$(arg gui)" />
+
+</launch>
diff --git a/jsk_perception/node_scripts/detection_node.py b/jsk_perception/node_scripts/detection_node.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+
+import rospy
+from jsk_perception.vil_inference_client import DINOClientNode
+
+
+def main():  # entry point: start the ROS node that talks to the GroundingDINO inference server
+    rospy.init_node("dino")  # default node name; detection.launch starts this script under the name "detection"
+    node = DINOClientNode()  # presumably kept in a variable so the client stays referenced while spinning - TODO confirm
+    rospy.spin()  # block until the node is shut down
+
+if __name__ == "__main__":
+    main()
diff --git a/jsk_perception/sample/config/sample_ofa_config.rviz b/jsk_perception/sample/config/sample_ofa_config.rviz
@@ -5,7 +5,7 @@ Panels:
     Property Tree Widget:
       Expanded: ~
       Splitter Ratio: 0.4870370328426361
-    Tree Height: 509
+    Tree Height: 625
   - Class: rviz/Selection
     Name: Selection
   - Class: rviz/Tool Properties
@@ -113,6 +113,38 @@ Visualization Manager:
       text size: 12
       top: 320
       width: 512
+    - Class: jsk_rviz_plugin/OverlayImage
+      Enabled: true
+      Name: ObjectDetection/Output/Image
+      Topic: /detection/output/image
+      Value: true
+      alpha: 0.800000011920929
+      height: 128
+      keep aspect ratio: true
+      left: 530
+      overwrite alpha value: false
+      top: 10
+      transport hint: raw
+      width: 320
+    - Align Bottom: false
+      Background Alpha: 0.800000011920929
+      Background Color: 0; 0; 0
+      Class: jsk_rviz_plugin/String
+      Enabled: true
+      Foreground Alpha: 0.800000011920929
+      Foreground Color: 255; 255; 255
+      Name: ObjectDetection/Visualize
+      Overtake Color Properties: true
+      Overtake Position Properties: true
+      Topic: /detection/visualize
+      Value: true
+      font: DejaVu Sans Mono
+      height: 500
+      left: 530
+      line width: 2
+      text size: 12
+      top: 320
+      width: 512
   Enabled: true
   Global Options:
     Background Color: 48; 48; 48
@@ -164,10 +196,10 @@ Visualization Manager:
 Window Geometry:
   Displays:
     collapsed: false
-  Height: 1025
+  Height: 1016
   Hide Left Dock: false
   Hide Right Dock: true
-  QMainWindow State: 000000ff00000000fd00000004000000000000021e000002f7fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b100fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000006f000002f70000018400fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e0000013500fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000078000000060fc0100000002fb0000000800540069006d0065010000000000000780000005cd00fffffffb0000000800540069006d0065010000000000000450000000000000000000000556000002f700000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
+  QMainWindow State: 000000ff00000000fd00000004000000000000021e00000338fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005c00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003d00000338000000c900fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e000000a400fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000073800000060fc0100000002fb0000000800540069006d0065010000000000000738000003bc00fffffffb0000000800540069006d00650100000000000004500000000000000000000005140000033800000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
   Selection:
     collapsed: false
   Time:
@@ -176,6 +208,6 @@ Window Geometry:
     collapsed: false
   Views:
     collapsed: true
-  Width: 1920
-  X: 1440
-  Y: 1096
+  Width: 1848
+  X: 72
+  Y: 27