cvat/serverless/pytorch/shiyinzhang/iog/nuclio/model_handler.py

# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) CVAT.ai Corporation
#
# SPDX-License-Identifier: MIT

import numpy as np
import os
import cv2
import torch
from networks.mainnetwork import Network
from dataloaders import helpers

class ModelHandler:
    """Wrap the IOG segmentation network for single-image CPU inference.

    The weights are loaded once in ``__init__``; ``handle`` then converts a
    bounding box plus two sets of click points into a full-size ``uint8`` mask.
    """

    # Padding, in pixels, added on every side of the requested bounding box.
    CROP_PADDING = 30
    # Side length of the square crop that is fed to the network.
    INPUT_SIZE = 512

    def __init__(self):
        """Build the network and load the ``IOG_PASCAL_SBD.pth`` checkpoint.

        The checkpoint directory is taken from the ``MODEL_PATH`` environment
        variable and falls back to ``/opt/nuclio/iog``.
        """
        base_dir = os.environ.get("MODEL_PATH", "/opt/nuclio/iog")
        model_path = os.path.join(base_dir, "IOG_PASCAL_SBD.pth")
        self.device = torch.device("cpu")

        # Number of input channels (RGB + heatmap of IOG points)
        self.net = Network(nInputChannels=5, num_classes=1, backbone='resnet101',
            output_stride=16, sync_bn=None, freeze_bn=False)

        # map_location keeps the load working when the checkpoint holds
        # tensors saved from another device (e.g. CUDA); this handler is
        # pinned to the CPU.
        pretrain_dict = torch.load(model_path, map_location=self.device,
            weights_only=True)
        self.net.load_state_dict(pretrain_dict)
        self.net.to(self.device)
        self.net.eval()

    def handle(self, image, bbox, pos_points, neg_points, threshold):
        """Segment the region selected by ``bbox`` and the click points.

        Args:
            image: Source image exposing ``width``, ``height`` and
                ``crop(box)`` (presumably a PIL image - confirm with caller).
            bbox: ``[[x_min, y_min], [x_max, y_max]]`` in image pixels.
            pos_points: ``(x, y)`` points in image pixels, rendered into the
                first heatmap channel.
            neg_points: ``(x, y)`` points in image pixels, rendered into the
                second heatmap channel.
            threshold: Network outputs strictly greater than this value are
                treated as foreground.

        Returns:
            A ``numpy.ndarray`` of shape ``(image.height, image.width)`` and
            dtype ``uint8``: 255 on predicted foreground inside the padded
            crop, 0 everywhere else.
        """
        size = self.INPUT_SIZE
        padding = self.CROP_PADDING

        with torch.no_grad():
            # Padded crop rectangle, clamped to the image bounds.
            crop_bbox = [
                max(bbox[0][0] - padding, 0),
                max(bbox[0][1] - padding, 0),
                min(bbox[1][0] + padding, image.width - 1),
                min(bbox[1][1] + padding, image.height - 1)
            ]
            crop_width = int(crop_bbox[2] - crop_bbox[0] + 1)
            crop_height = int(crop_bbox[3] - crop_bbox[1] + 1)

            # TODO: try to use crop_from_bbox(img, bbox, zero_pad) here
            input_crop = np.array(image.crop(crop_bbox)).astype(np.float32)

            # Resize the crop to the fixed network input size.
            input_crop = cv2.resize(input_crop, (size, size),
                interpolation=cv2.INTER_NEAREST)
            scale_x = size / crop_width
            scale_y = size / crop_height

            def translate_points_to_crop(points):
                # Image coordinates -> coordinates inside the resized crop.
                return [
                    ((p[0] - crop_bbox[0]) * scale_x,
                     (p[1] - crop_bbox[1]) * scale_y)
                    for p in points]

            heatmap_shape = input_crop.shape[:2]

            def render_heatmap(points):
                # Per-pixel maximum over one gaussian per point.
                heatmap = np.zeros(shape=heatmap_shape, dtype=np.float64)
                for p in points:
                    heatmap = np.maximum(
                        heatmap, helpers.make_gaussian(heatmap_shape, center=p))
                return heatmap

            # Create IOG image: one channel per point set.
            pos_gt = render_heatmap(translate_points_to_crop(pos_points))
            neg_gt = render_heatmap(translate_points_to_crop(neg_points))
            iog_image = np.stack((pos_gt, neg_gt), axis=2).astype(dtype=input_crop.dtype)

            # Convert iog_image to an image (0-255 values)
            cv2.normalize(iog_image, iog_image, 0, 255, cv2.NORM_MINMAX)

            # Concatenate input crop and IOG image
            input_blob = np.concatenate((input_crop, iog_image), axis=2)

            # numpy image: H x W x C
            # torch image: C X H X W
            input_blob = input_blob.transpose((2, 0, 1))
            # batch size is 1
            input_blob = np.array([input_blob])
            input_tensor = torch.from_numpy(input_blob).to(self.device)

            # Call the module itself rather than .forward(); element 4 of the
            # returned sequence is the one used as the mask prediction.
            output_mask = self.net(input_tensor)[4]
            pred = np.transpose(
                output_mask.detach().cpu().numpy()[0, :, :, :], (1, 2, 0))
            pred = np.squeeze(pred > threshold)

            # Bring the binary prediction back to the crop resolution.
            pred = np.array(pred, dtype=np.uint8)
            pred = cv2.resize(pred, dsize=(crop_width, crop_height),
                interpolation=cv2.INTER_CUBIC)
            # Scale explicitly to 0/255. Min-max normalisation has no range to
            # stretch when every pixel is equal, so an all-foreground
            # prediction would not reach 255.
            pred = np.where(pred > 0, 255, 0).astype(np.uint8)

            # Paste the crop-sized prediction into an empty full-size mask.
            mask = np.zeros((image.height, image.width), dtype=np.uint8)
            x = int(crop_bbox[0])
            y = int(crop_bbox[1])
            mask[y : y + crop_height, x : x + crop_width] = pred

            return mask
继承于2.44.3版本 2025-09-16 01:19:40 +00:00			`# Copyright (C) 2020-2022 Intel Corporation`
			`# Copyright (C) CVAT.ai Corporation`
			`#`
			`# SPDX-License-Identifier: MIT`

			`import numpy as np`
			`import os`
			`import cv2`
			`import torch`
			`from networks.mainnetwork import Network`
			`from dataloaders import helpers`

			`class ModelHandler:`
			`def __init__(self):`
			`base_dir = os.environ.get("MODEL_PATH", "/opt/nuclio/iog")`
			`model_path = os.path.join(base_dir, "IOG_PASCAL_SBD.pth")`
			`self.device = torch.device("cpu")`

			`# Number of input channels (RGB + heatmap of IOG points)`
			`self.net = Network(nInputChannels=5, num_classes=1, backbone='resnet101',`
			`output_stride=16, sync_bn=None, freeze_bn=False)`

			`pretrain_dict = torch.load(model_path, weights_only=True)`
			`self.net.load_state_dict(pretrain_dict)`
			`self.net.to(self.device)`
			`self.net.eval()`

			`def handle(self, image, bbox, pos_points, neg_points, threshold):`
			`with torch.no_grad():`
			`# extract a crop with padding from the image`
			`crop_padding = 30`
			`crop_bbox = [`
			`max(bbox[0][0] - crop_padding, 0),`
			`max(bbox[0][1] - crop_padding, 0),`
			`min(bbox[1][0] + crop_padding, image.width - 1),`
			`min(bbox[1][1] + crop_padding, image.height - 1)`
			`]`
			`crop_shape = (`
			`int(crop_bbox[2] - crop_bbox[0] + 1), # width`
			`int(crop_bbox[3] - crop_bbox[1] + 1), # height`
			`)`

			`# try to use crop_from_bbox(img, bbox, zero_pad) here`
			`input_crop = np.array(image.crop(crop_bbox)).astype(np.float32)`

			`# resize the crop`
			`input_crop = cv2.resize(input_crop, (512, 512), interpolation=cv2.INTER_NEAREST)`
			`crop_scale = (512 / crop_shape[0], 512 / crop_shape[1])`

			`def translate_points_to_crop(points):`
			`points = [`
			`((p[0] - crop_bbox[0]) * crop_scale[0], # x`
			`(p[1] - crop_bbox[1]) * crop_scale[1]) # y`
			`for p in points]`

			`return points`

			`pos_points = translate_points_to_crop(pos_points)`
			`neg_points = translate_points_to_crop(neg_points)`

			`# Create IOG image`
			`pos_gt = np.zeros(shape=input_crop.shape[:2], dtype=np.float64)`
			`neg_gt = np.zeros(shape=input_crop.shape[:2], dtype=np.float64)`
			`for p in pos_points:`
			`pos_gt = np.maximum(pos_gt, helpers.make_gaussian(pos_gt.shape, center=p))`
			`for p in neg_points:`
			`neg_gt = np.maximum(neg_gt, helpers.make_gaussian(neg_gt.shape, center=p))`
			`iog_image = np.stack((pos_gt, neg_gt), axis=2).astype(dtype=input_crop.dtype)`

			`# Convert iog_image to an image (0-255 values)`
			`cv2.normalize(iog_image, iog_image, 0, 255, cv2.NORM_MINMAX)`

			`# Concatenate input crop and IOG image`
			`input_blob = np.concatenate((input_crop, iog_image), axis=2)`

			`# numpy image: H x W x C`
			`# torch image: C X H X W`
			`input_blob = input_blob.transpose((2, 0, 1))`
			`# batch size is 1`
			`input_blob = np.array([input_blob])`
			`input_tensor = torch.from_numpy(input_blob)`

			`input_tensor = input_tensor.to(self.device)`
			`output_mask = self.net.forward(input_tensor)[4]`
			`output_mask = output_mask.to(self.device)`
			`pred = np.transpose(output_mask.data.numpy()[0, :, :, :], (1, 2, 0))`
			`pred = pred > threshold`
			`pred = np.squeeze(pred)`

			`# Convert a mask to a polygon`
			`pred = np.array(pred, dtype=np.uint8)`
			`pred = cv2.resize(pred, dsize=(crop_shape[0], crop_shape[1]),`
			`interpolation=cv2.INTER_CUBIC)`
			`cv2.normalize(pred, pred, 0, 255, cv2.NORM_MINMAX)`

			`mask = np.zeros((image.height, image.width), dtype=np.uint8)`
			`x = int(crop_bbox[0])`
			`y = int(crop_bbox[1])`
			`mask[y : y + crop_shape[1], x : x + crop_shape[0]] = pred`

			`return mask`