Merge pull request #40 from bit-bots/fix/rescale_segmentation
fix/rescale_segmentations(...) & rescale_boxes(...)
jaagut authored May 18, 2022
2 parents 483a7e8 + c045008 commit 8f35de7
Showing 3 changed files with 120 additions and 80 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "YOEO"
version = "1.1.1"
version = "1.2.0"
description = "A hybrid CNN for object detection and semantic segmentation"
authors = ["Florian Vahl <[email protected]>", "Jan Gutsche <[email protected]>"]

4 changes: 2 additions & 2 deletions yoeo/detect.py
@@ -17,7 +17,7 @@
from imgaug.augmentables.segmaps import SegmentationMapsOnImage

from yoeo.models import load_model
from yoeo.utils.utils import load_classes, rescale_boxes, non_max_suppression, print_environment_info, rescale_segmentations
from yoeo.utils.utils import load_classes, rescale_boxes, non_max_suppression, print_environment_info, rescale_segmentation
from yoeo.utils.datasets import ImageFolder
from yoeo.utils.transforms import Resize, DEFAULT_TRANSFORMS

@@ -100,7 +100,7 @@ def detect_image(model, image, img_size=416, conf_thres=0.5, nms_thres=0.5):
detections, segmentations = model(input_img)
detections = non_max_suppression(detections, conf_thres, nms_thres)
detections = rescale_boxes(detections[0], img_size, image.shape[0:2])
segmentations = rescale_segmentations(segmentations, img_size, image.shape[0:2])
segmentations = rescale_segmentation(segmentations, image.shape[0:2])
return detections.numpy(), segmentations.cpu().detach().numpy()
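
For orientation, a minimal usage sketch of the updated detect_image API follows. The config/weights paths, the cv2-based image loading, and the load_model(config_path, weights_path) signature are assumptions for illustration, not code from this commit.

import cv2

from yoeo.models import load_model
from yoeo.detect import detect_image

# Placeholder paths; load_model(config_path, weights_path) is assumed here.
model = load_model("config/yoeo.cfg", "weights/yoeo.pth")

# detect_image is assumed to take an RGB numpy image.
image = cv2.cvtColor(cv2.imread("example.png"), cv2.COLOR_BGR2RGB)

# Boxes come back in pixel coordinates of the original image;
# the segmentation map has the original (height, width) as well.
detections, segmentation = detect_image(model, image, img_size=416)
print(detections.shape, segmentation.shape)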


194 changes: 117 additions & 77 deletions yoeo/utils/utils.py
@@ -64,99 +64,138 @@ def weights_init_normal(m):
nn.init.constant_(m.bias.data, 0.0)


def rescale_boxes(boxes, current_dim, original_shape):
def rescale_boxes(boxes, output_img_size, original_img_size):
"""
Rescales bounding boxes to the original shape
Rescale bounding boxes as if they were calculated on the original, non-padded image.
1. bounding boxes are scaled as if they were calculated on the (square) padded original image.
2. padding is subtracted, thereby shifting the boxes as if they were calculated on the original,
non-padded image.
:param boxes: detection output
:type boxes: torch.Tensor with shape(#boxes, 6)
:param output_img_size: size of the image for which the network calculates the bounding boxes (1D)
:type output_img_size: int
:param original_img_size: size of the original image (height, width)
:type original_img_size: Tuple[int, int] (height, width)
:return: rescaled detection output
:rtype: torch.Tensor with shape(#boxes, 6)
"""
orig_h, orig_w = original_shape
pad_y, pad_x = calculate_applied_padding_per_dimension(current_dim, original_shape)

# Image height and width after padding is removed
unpad_h = current_dim - pad_y
unpad_w = current_dim - pad_x

# Rescale bounding boxes to dimension of original image
boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w
boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h
boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w
boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h
return boxes

rescaled_boxes = rescale_boxes_to_original_padded_img_size(boxes, output_img_size, max(original_img_size))
rescaled_boxes = unpad_box_coordinates(rescaled_boxes, original_img_size)

return rescaled_boxes


def calculate_applied_padding_per_dimension(current_dim: int, original_shape: Tuple[int, int]) -> Tuple[int, int]:
def rescale_boxes_to_original_padded_img_size(boxes, output_img_size: int, original_max_size: int):
"""
Calculate the total amount of padding that was added to each image dimension, i.e.
current_dim = original_shape[0] + padding_in_1st_dim = original_shape[1] + padding_in_2nd_dim
:param current_dim: segmentation output dimension (1D)
:type current_dim: int
:param original_shape: original image shape (2D)
:type original_shape: Tuple[int, int] (height, width)
:return: Tuple containing paddings (height, width)
:rtype: Tuple[int, int]
Rescale bounding boxes as if they were calculated on the (square) padded original image.
:param boxes: detection output
:type boxes: torch.Tensor with shape(#boxes, 6)
:param output_img_size: size of the image for which the network calculates the bounding boxes (1D)
:type output_img_size: int
:param original_max_size: maximum size of the original image (1)
:type original_max_size: int
:return: rescaled detection output
:rtype: torch.Tensor with shape(#boxes, 6)
"""
orig_h, orig_w = original_shape
pad_w = max(orig_h - orig_w, 0) * (current_dim / max(original_shape))
pad_h = max(orig_w - orig_h, 0) * (current_dim / max(original_shape))
return int(pad_h), int(pad_w)

scale_factor = original_max_size / output_img_size
boxes[:, 0:4] = boxes[:, 0:4] * scale_factor

return boxes


def rescale_segmentations(segmentations, current_dim: int, original_shape: Tuple[int, int]):
def unpad_box_coordinates(boxes, original_img_size: Tuple[int, int]):
"""
Removes padding and interpolates segmentations to the original image shape.
:param segmentations: YOEO segmentation output
:type segmentations: torch.Tensor with shape (1, current_dim, current_dim)
:param current_dim: segmentation output dimension (1D)
:type current_dim: int
:param original_shape: original image shape (2D)
:type original_shape: Tuple[int, int] (height, width)
Subtract padding, thereby shifting the boxes as if they were calculated on the original,
non-padded image.
:param boxes: detection output
:type boxes: torch.Tensor with shape(#boxes, 6)
:param original_img_size: size of the original image (height, width)
:type original_img_size: Tuple[int, int] (height, width)
:return: rescaled detection output
:rtype: torch.Tensor with shape(#boxes, 6)
"""

padding = calculate_applied_padding_per_side(current_dim, original_shape)
unpadded_segmentations = remove_applied_padding(segmentations, current_dim, padding)
return interpolate_to_original_shape(unpadded_segmentations, original_shape)

padding_left = max(original_img_size[0] - original_img_size[1], 0) // 2
padding_top = max(original_img_size[1] - original_img_size[0], 0) // 2

def calculate_applied_padding_per_side(current_dim: int, original_shape: Tuple[int, int]) -> Tuple[int, int]:
"""
Calculate the amount of padding that was added to each side of each image dimension, i.e.
current_dim = padding_in_1st_dim + original_shape[0] + padding_in_1st_dim
current_dim = padding_in_2nd_dim + original_shape[1] + padding_in_2nd_dim
:param current_dim: segmentation output dimension (1D)
:type current_dim: int
:param original_shape: original image shape (2D)
:type original_shape: Tuple[int, int] (height, width)
:return: Tuple containing paddings (height, width)
:rtype: Tuple[int, int]
boxes[:, 0] = boxes[:, 0] - padding_left
boxes[:, 1] = boxes[:, 1] - padding_top
boxes[:, 2] = boxes[:, 2] - padding_left
boxes[:, 3] = boxes[:, 3] - padding_top

return boxes
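
A worked example of the two steps described in rescale_boxes, with hypothetical numbers: for a 416x416 network output and a 720x1280 (height x width) original image, step 1 scales by 1280 / 416 ≈ 3.08, and step 2 shifts the y coordinates up by the vertical padding (1280 - 720) // 2 = 280. The values below are made up for illustration.

import torch

from yoeo.utils.utils import rescale_boxes

output_img_size = 416               # square network input/output size
original_img_size = (720, 1280)     # (height, width) of the original image

# One fake detection (x1, y1, x2, y2, conf, class) in 416x416 padded-image coordinates
boxes = torch.tensor([[100.0, 150.0, 200.0, 250.0, 0.9, 0.0]])

rescaled = rescale_boxes(boxes.clone(), output_img_size, original_img_size)

# Step 1: scale by 1280 / 416 ≈ 3.077 -> x1 ≈ 307.7, y1 ≈ 461.5, x2 ≈ 615.4, y2 ≈ 769.2
# Step 2: subtract the top padding (1280 - 720) // 2 = 280 from y1 and y2;
#         x1 and x2 are unchanged because no horizontal padding was added.
print(rescaled)  # approximately [[307.7, 181.5, 615.4, 489.2, 0.9, 0.0]]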


def rescale_segmentation(segmentation, original_img_size: Tuple[int, int]):
"""
pad_h, pad_w = calculate_applied_padding_per_dimension(current_dim, original_shape)
return int(pad_h // 2), int(pad_w // 2)


def remove_applied_padding(segmentations, current_dim: int, padding: Tuple[int, int]):
Interpolate segmentation back to original image size and remove paddings.
1. segmentation is rescaled as if it was calculated on the original, padded image size
2. paddings are removed, thereby restoring the original image size
:param segmentation: segmentation output
:type segmentation: torch.Tensor with shape (1, height, width) and height == width
:param original_img_size: size of the original image (height, width)
:type original_img_size: Tuple[int, int] (height, width)
:return: rescaled segmentation
:rtype: torch.Tensor with shape (1, original_img_size[0], original_img_size[1])
"""
:param segmentations: YOEO segmentation output
:type segmentations: torch.Tensor with shape (1, current_dim, current_dim)
:return: unpadded YOEO segmentation output
:rtype: torch.Tensor

rescaled_seg = rescale_segmentation_to_original_padded_img_size(segmentation, max(original_img_size))
rescaled_seg = unpad_segmentation(rescaled_seg, original_img_size)

return rescaled_seg


def rescale_segmentation_to_original_padded_img_size(segmentation, original_max_size: int):
"""

pad_h, pad_w = padding
return segmentations[..., pad_h:current_dim-pad_h, pad_w:current_dim-pad_w]


def interpolate_to_original_shape(segmentations, original_shape: Tuple[int, int]):
Rescale the segmentation as if it was calculated on the original, padded image size using
"nearest-exact" interpolation.
:param segmentation: segmentation output
:type segmentation: torch.Tensor with shape (1, height, width) and height == width
:param original_max_size: maximum size of the original image (1)
:type original_max_size: int
:return: segmentation output with original, padded image size
:rtype: torch.Tensor with shape (1, original_max_size, original_max_size)
"""
:param segmentations: YOEO segmentation output
:type segmentations: torch.Tensor with shape (1, current_dim, current_dim)
:return: interpolated YOEO segmentation output with original image shape
:rtype: torch.Tensor with shape (1, *original_shape)

return nn.functional.interpolate(
segmentation.unsqueeze(0),
size=(original_max_size, original_max_size),
mode="nearest-exact"
).squeeze(0)


def unpad_segmentation(segmentation, original_img_size: Tuple[int, int]):
"""
Remove paddings, thereby restoring the original image size
return nn.functional.interpolate(segmentations.unsqueeze(0).type(torch.ByteTensor), size=original_shape, mode="nearest").squeeze(0)

:param segmentation: segmentation output
:type segmentation: torch.Tensor with shape (1, height, width) and height == width
:param original_img_size: original image size (height, width)
:type original_img_size: Tuple[int, int]
:return: unpadded segmentation output
:rtype: torch.Tensor with shape (1, original_img_size[0], original_img_size[1])
"""

current_size = segmentation.size(dim=1)
original_height, original_width = original_img_size

total_vertical_padding = max(0, original_width - original_height)
total_horizontal_padding = max(0, original_height - original_width)

padding_top = total_vertical_padding // 2
padding_bottom = total_vertical_padding - padding_top
padding_left = total_horizontal_padding // 2
padding_right = total_horizontal_padding - padding_left

return segmentation[..., padding_top:current_size - padding_bottom, padding_left:current_size - padding_right]
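
As a quick sanity check for the segmentation path, the sketch below pushes a fake square class map through rescale_segmentation. Shapes and dtype are illustrative assumptions, and "nearest-exact" interpolation needs a reasonably recent PyTorch (1.11 or later, if I recall correctly).

import torch

from yoeo.utils.utils import rescale_segmentation

original_img_size = (720, 1280)  # (height, width)

# Fake class-index map from the network, kept as float so interpolate() accepts it
segmentation = torch.randint(0, 3, (1, 416, 416)).float()

rescaled = rescale_segmentation(segmentation, original_img_size)

# Interpolated up to the padded 1280x1280 size, then (1280 - 720) // 2 = 280 rows
# of padding are cropped from the top and bottom.
assert rescaled.shape == (1, 720, 1280)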


def xywh2xyxy(x):
y = x.new(x.shape)
@@ -291,7 +330,8 @@ def get_batch_statistics(outputs, targets, iou_threshold):
continue

# Filter target_boxes by pred_label so that we only match against boxes of our own label
filtered_target_position, filtered_targets = zip(*filter(lambda x: target_labels[x[0]] == pred_label, enumerate(target_boxes)))
filtered_target_position, filtered_targets = zip(
*filter(lambda x: target_labels[x[0]] == pred_label, enumerate(target_boxes)))

# Find the best matching target for our predicted box
iou, box_filtered_index = bbox_iou(pred_box.unsqueeze(0), torch.stack(filtered_targets)).max(0)
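
The zip(*filter(...)) line above is fairly dense; here is a standalone toy illustration of what it produces (values are made up, not taken from any test data).

import torch

target_labels = torch.tensor([0, 1, 0])
target_boxes = torch.tensor([[0., 0., 10., 10.],
                             [5., 5., 15., 15.],
                             [20., 20., 30., 30.]])
pred_label = 0

filtered_target_position, filtered_targets = zip(
    *filter(lambda x: target_labels[x[0]] == pred_label, enumerate(target_boxes)))

# filtered_target_position == (0, 2): indices of the targets that share the predicted label.
# filtered_targets is a tuple of the matching box rows, ready for torch.stack(...).
print(filtered_target_position, torch.stack(filtered_targets).shape)  # (0, 2) torch.Size([2, 4])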
@@ -454,7 +494,7 @@ def seg_iou(pred, target, classes):
pred = pred.view(-1)
target = target.view(-1)

for cls in range(classes):
for cls in range(classes):
pred_inds = pred == cls
target_inds = target == cls
intersection = (pred_inds[target_inds]).long().sum().data.cpu().item() # Cast to long to prevent overflows
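
A tiny example of the per-class intersection trick in seg_iou (toy tensors, chosen only to make the indexing visible).

import torch

pred = torch.tensor([0, 1, 1, 2, 2, 2])
target = torch.tensor([0, 1, 2, 2, 2, 1])

cls = 2
pred_inds = pred == cls      # where the prediction says class 2
target_inds = target == cls  # where the ground truth is class 2

# pred_inds[target_inds] keeps the prediction mask only at ground-truth positions of this class,
# so its sum counts the pixels on which prediction and ground truth agree: here 2.
intersection = (pred_inds[target_inds]).long().sum().item()
print(intersection)  # 2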
