support gsam2 image predictor model
This commit is contained in:
0
grounding_dino/groundingdino/datasets/__init__.py
Normal file
0
grounding_dino/groundingdino/datasets/__init__.py
Normal file
269
grounding_dino/groundingdino/datasets/cocogrounding_eval.py
Normal file
269
grounding_dino/groundingdino/datasets/cocogrounding_eval.py
Normal file
@@ -0,0 +1,269 @@
|
||||
# ------------------------------------------------------------------------
|
||||
# Grounding DINO. Midified by Shilong Liu.
|
||||
# url: https://github.com/IDEA-Research/GroundingDINO
|
||||
# Copyright (c) 2023 IDEA. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
||||
# ------------------------------------------------------------------------
|
||||
# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
"""
|
||||
COCO evaluator that works in distributed mode.
|
||||
|
||||
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
|
||||
The difference is that there is less copy-pasting from pycocotools
|
||||
in the end of the file, as python3 can suppress prints with contextlib
|
||||
"""
|
||||
import contextlib
|
||||
import copy
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as mask_util
|
||||
import torch
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
|
||||
from groundingdino.util.misc import all_gather
|
||||
|
||||
|
||||
class CocoGroundingEvaluator(object):
|
||||
def __init__(self, coco_gt, iou_types, useCats=True):
|
||||
assert isinstance(iou_types, (list, tuple))
|
||||
coco_gt = copy.deepcopy(coco_gt)
|
||||
self.coco_gt = coco_gt
|
||||
|
||||
self.iou_types = iou_types
|
||||
self.coco_eval = {}
|
||||
for iou_type in iou_types:
|
||||
self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
|
||||
self.coco_eval[iou_type].useCats = useCats
|
||||
|
||||
self.img_ids = []
|
||||
self.eval_imgs = {k: [] for k in iou_types}
|
||||
self.useCats = useCats
|
||||
|
||||
def update(self, predictions):
|
||||
img_ids = list(np.unique(list(predictions.keys())))
|
||||
self.img_ids.extend(img_ids)
|
||||
|
||||
for iou_type in self.iou_types:
|
||||
results = self.prepare(predictions, iou_type)
|
||||
|
||||
# suppress pycocotools prints
|
||||
with open(os.devnull, "w") as devnull:
|
||||
with contextlib.redirect_stdout(devnull):
|
||||
coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
|
||||
|
||||
coco_eval = self.coco_eval[iou_type]
|
||||
|
||||
coco_eval.cocoDt = coco_dt
|
||||
coco_eval.params.imgIds = list(img_ids)
|
||||
coco_eval.params.useCats = self.useCats
|
||||
img_ids, eval_imgs = evaluate(coco_eval)
|
||||
|
||||
self.eval_imgs[iou_type].append(eval_imgs)
|
||||
|
||||
def synchronize_between_processes(self):
|
||||
for iou_type in self.iou_types:
|
||||
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
|
||||
create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
|
||||
|
||||
def accumulate(self):
|
||||
for coco_eval in self.coco_eval.values():
|
||||
coco_eval.accumulate()
|
||||
|
||||
def summarize(self):
|
||||
for iou_type, coco_eval in self.coco_eval.items():
|
||||
print("IoU metric: {}".format(iou_type))
|
||||
coco_eval.summarize()
|
||||
|
||||
def prepare(self, predictions, iou_type):
|
||||
if iou_type == "bbox":
|
||||
return self.prepare_for_coco_detection(predictions)
|
||||
elif iou_type == "segm":
|
||||
return self.prepare_for_coco_segmentation(predictions)
|
||||
elif iou_type == "keypoints":
|
||||
return self.prepare_for_coco_keypoint(predictions)
|
||||
else:
|
||||
raise ValueError("Unknown iou type {}".format(iou_type))
|
||||
|
||||
def prepare_for_coco_detection(self, predictions):
|
||||
coco_results = []
|
||||
for original_id, prediction in predictions.items():
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
|
||||
boxes = prediction["boxes"]
|
||||
boxes = convert_to_xywh(boxes).tolist()
|
||||
scores = prediction["scores"].tolist()
|
||||
labels = prediction["labels"].tolist()
|
||||
|
||||
coco_results.extend(
|
||||
[
|
||||
{
|
||||
"image_id": original_id,
|
||||
"category_id": labels[k],
|
||||
"bbox": box,
|
||||
"score": scores[k],
|
||||
}
|
||||
for k, box in enumerate(boxes)
|
||||
]
|
||||
)
|
||||
return coco_results
|
||||
|
||||
def prepare_for_coco_segmentation(self, predictions):
|
||||
coco_results = []
|
||||
for original_id, prediction in predictions.items():
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
|
||||
scores = prediction["scores"]
|
||||
labels = prediction["labels"]
|
||||
masks = prediction["masks"]
|
||||
|
||||
masks = masks > 0.5
|
||||
|
||||
scores = prediction["scores"].tolist()
|
||||
labels = prediction["labels"].tolist()
|
||||
|
||||
rles = [
|
||||
mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
|
||||
for mask in masks
|
||||
]
|
||||
for rle in rles:
|
||||
rle["counts"] = rle["counts"].decode("utf-8")
|
||||
|
||||
coco_results.extend(
|
||||
[
|
||||
{
|
||||
"image_id": original_id,
|
||||
"category_id": labels[k],
|
||||
"segmentation": rle,
|
||||
"score": scores[k],
|
||||
}
|
||||
for k, rle in enumerate(rles)
|
||||
]
|
||||
)
|
||||
return coco_results
|
||||
|
||||
def prepare_for_coco_keypoint(self, predictions):
|
||||
coco_results = []
|
||||
for original_id, prediction in predictions.items():
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
|
||||
boxes = prediction["boxes"]
|
||||
boxes = convert_to_xywh(boxes).tolist()
|
||||
scores = prediction["scores"].tolist()
|
||||
labels = prediction["labels"].tolist()
|
||||
keypoints = prediction["keypoints"]
|
||||
keypoints = keypoints.flatten(start_dim=1).tolist()
|
||||
|
||||
coco_results.extend(
|
||||
[
|
||||
{
|
||||
"image_id": original_id,
|
||||
"category_id": labels[k],
|
||||
"keypoints": keypoint,
|
||||
"score": scores[k],
|
||||
}
|
||||
for k, keypoint in enumerate(keypoints)
|
||||
]
|
||||
)
|
||||
return coco_results
|
||||
|
||||
|
||||
def convert_to_xywh(boxes):
|
||||
xmin, ymin, xmax, ymax = boxes.unbind(1)
|
||||
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
|
||||
|
||||
|
||||
def merge(img_ids, eval_imgs):
|
||||
all_img_ids = all_gather(img_ids)
|
||||
all_eval_imgs = all_gather(eval_imgs)
|
||||
|
||||
merged_img_ids = []
|
||||
for p in all_img_ids:
|
||||
merged_img_ids.extend(p)
|
||||
|
||||
merged_eval_imgs = []
|
||||
for p in all_eval_imgs:
|
||||
merged_eval_imgs.append(p)
|
||||
|
||||
merged_img_ids = np.array(merged_img_ids)
|
||||
merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
|
||||
|
||||
# keep only unique (and in sorted order) images
|
||||
merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
|
||||
merged_eval_imgs = merged_eval_imgs[..., idx]
|
||||
|
||||
return merged_img_ids, merged_eval_imgs
|
||||
|
||||
|
||||
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
|
||||
img_ids, eval_imgs = merge(img_ids, eval_imgs)
|
||||
img_ids = list(img_ids)
|
||||
eval_imgs = list(eval_imgs.flatten())
|
||||
|
||||
coco_eval.evalImgs = eval_imgs
|
||||
coco_eval.params.imgIds = img_ids
|
||||
coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
|
||||
|
||||
|
||||
#################################################################
|
||||
# From pycocotools, just removed the prints and fixed
|
||||
# a Python3 bug about unicode not defined
|
||||
#################################################################
|
||||
|
||||
|
||||
def evaluate(self):
|
||||
"""
|
||||
Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
|
||||
:return: None
|
||||
"""
|
||||
# tic = time.time()
|
||||
# print('Running per image evaluation...')
|
||||
p = self.params
|
||||
# add backward compatibility if useSegm is specified in params
|
||||
if p.useSegm is not None:
|
||||
p.iouType = "segm" if p.useSegm == 1 else "bbox"
|
||||
print("useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType))
|
||||
# print('Evaluate annotation type *{}*'.format(p.iouType))
|
||||
p.imgIds = list(np.unique(p.imgIds))
|
||||
if p.useCats:
|
||||
p.catIds = list(np.unique(p.catIds))
|
||||
p.maxDets = sorted(p.maxDets)
|
||||
self.params = p
|
||||
|
||||
self._prepare()
|
||||
# loop through images, area range, max detection number
|
||||
catIds = p.catIds if p.useCats else [-1]
|
||||
|
||||
if p.iouType == "segm" or p.iouType == "bbox":
|
||||
computeIoU = self.computeIoU
|
||||
elif p.iouType == "keypoints":
|
||||
computeIoU = self.computeOks
|
||||
self.ious = {
|
||||
(imgId, catId): computeIoU(imgId, catId)
|
||||
for imgId in p.imgIds
|
||||
for catId in catIds}
|
||||
|
||||
evaluateImg = self.evaluateImg
|
||||
maxDet = p.maxDets[-1]
|
||||
evalImgs = [
|
||||
evaluateImg(imgId, catId, areaRng, maxDet)
|
||||
for catId in catIds
|
||||
for areaRng in p.areaRng
|
||||
for imgId in p.imgIds
|
||||
]
|
||||
# this is NOT in the pycocotools code, but could be done outside
|
||||
evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
|
||||
self._paramsEval = copy.deepcopy(self.params)
|
||||
# toc = time.time()
|
||||
# print('DONE (t={:0.2f}s).'.format(toc-tic))
|
||||
return p.imgIds, evalImgs
|
||||
|
||||
|
||||
#################################################################
|
||||
# end of straight copy from pycocotools, just removing the prints
|
||||
#################################################################
|
311
grounding_dino/groundingdino/datasets/transforms.py
Normal file
311
grounding_dino/groundingdino/datasets/transforms.py
Normal file
@@ -0,0 +1,311 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
"""
|
||||
Transforms and data augmentation for both image + bbox.
|
||||
"""
|
||||
import os
|
||||
import random
|
||||
|
||||
import PIL
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
import torchvision.transforms.functional as F
|
||||
|
||||
from grounding_dino.groundingdino.util.box_ops import box_xyxy_to_cxcywh
|
||||
from grounding_dino.groundingdino.util.misc import interpolate
|
||||
|
||||
|
||||
def crop(image, target, region):
|
||||
cropped_image = F.crop(image, *region)
|
||||
|
||||
target = target.copy()
|
||||
i, j, h, w = region
|
||||
|
||||
# should we do something wrt the original size?
|
||||
target["size"] = torch.tensor([h, w])
|
||||
|
||||
fields = ["labels", "area", "iscrowd", "positive_map"]
|
||||
|
||||
if "boxes" in target:
|
||||
boxes = target["boxes"]
|
||||
max_size = torch.as_tensor([w, h], dtype=torch.float32)
|
||||
cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
|
||||
cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
|
||||
cropped_boxes = cropped_boxes.clamp(min=0)
|
||||
area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
|
||||
target["boxes"] = cropped_boxes.reshape(-1, 4)
|
||||
target["area"] = area
|
||||
fields.append("boxes")
|
||||
|
||||
if "masks" in target:
|
||||
# FIXME should we update the area here if there are no boxes?
|
||||
target["masks"] = target["masks"][:, i : i + h, j : j + w]
|
||||
fields.append("masks")
|
||||
|
||||
# remove elements for which the boxes or masks that have zero area
|
||||
if "boxes" in target or "masks" in target:
|
||||
# favor boxes selection when defining which elements to keep
|
||||
# this is compatible with previous implementation
|
||||
if "boxes" in target:
|
||||
cropped_boxes = target["boxes"].reshape(-1, 2, 2)
|
||||
keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
|
||||
else:
|
||||
keep = target["masks"].flatten(1).any(1)
|
||||
|
||||
for field in fields:
|
||||
if field in target:
|
||||
target[field] = target[field][keep]
|
||||
|
||||
if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO":
|
||||
# for debug and visualization only.
|
||||
if "strings_positive" in target:
|
||||
target["strings_positive"] = [
|
||||
_i for _i, _j in zip(target["strings_positive"], keep) if _j
|
||||
]
|
||||
|
||||
return cropped_image, target
|
||||
|
||||
|
||||
def hflip(image, target):
|
||||
flipped_image = F.hflip(image)
|
||||
|
||||
w, h = image.size
|
||||
|
||||
target = target.copy()
|
||||
if "boxes" in target:
|
||||
boxes = target["boxes"]
|
||||
boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
|
||||
[w, 0, w, 0]
|
||||
)
|
||||
target["boxes"] = boxes
|
||||
|
||||
if "masks" in target:
|
||||
target["masks"] = target["masks"].flip(-1)
|
||||
|
||||
return flipped_image, target
|
||||
|
||||
|
||||
def resize(image, target, size, max_size=None):
|
||||
# size can be min_size (scalar) or (w, h) tuple
|
||||
|
||||
def get_size_with_aspect_ratio(image_size, size, max_size=None):
|
||||
w, h = image_size
|
||||
if max_size is not None:
|
||||
min_original_size = float(min((w, h)))
|
||||
max_original_size = float(max((w, h)))
|
||||
if max_original_size / min_original_size * size > max_size:
|
||||
size = int(round(max_size * min_original_size / max_original_size))
|
||||
|
||||
if (w <= h and w == size) or (h <= w and h == size):
|
||||
return (h, w)
|
||||
|
||||
if w < h:
|
||||
ow = size
|
||||
oh = int(size * h / w)
|
||||
else:
|
||||
oh = size
|
||||
ow = int(size * w / h)
|
||||
|
||||
return (oh, ow)
|
||||
|
||||
def get_size(image_size, size, max_size=None):
|
||||
if isinstance(size, (list, tuple)):
|
||||
return size[::-1]
|
||||
else:
|
||||
return get_size_with_aspect_ratio(image_size, size, max_size)
|
||||
|
||||
size = get_size(image.size, size, max_size)
|
||||
rescaled_image = F.resize(image, size)
|
||||
|
||||
if target is None:
|
||||
return rescaled_image, None
|
||||
|
||||
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
|
||||
ratio_width, ratio_height = ratios
|
||||
|
||||
target = target.copy()
|
||||
if "boxes" in target:
|
||||
boxes = target["boxes"]
|
||||
scaled_boxes = boxes * torch.as_tensor(
|
||||
[ratio_width, ratio_height, ratio_width, ratio_height]
|
||||
)
|
||||
target["boxes"] = scaled_boxes
|
||||
|
||||
if "area" in target:
|
||||
area = target["area"]
|
||||
scaled_area = area * (ratio_width * ratio_height)
|
||||
target["area"] = scaled_area
|
||||
|
||||
h, w = size
|
||||
target["size"] = torch.tensor([h, w])
|
||||
|
||||
if "masks" in target:
|
||||
target["masks"] = (
|
||||
interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5
|
||||
)
|
||||
|
||||
return rescaled_image, target
|
||||
|
||||
|
||||
def pad(image, target, padding):
|
||||
# assumes that we only pad on the bottom right corners
|
||||
padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
|
||||
if target is None:
|
||||
return padded_image, None
|
||||
target = target.copy()
|
||||
# should we do something wrt the original size?
|
||||
target["size"] = torch.tensor(padded_image.size[::-1])
|
||||
if "masks" in target:
|
||||
target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1]))
|
||||
return padded_image, target
|
||||
|
||||
|
||||
class ResizeDebug(object):
|
||||
def __init__(self, size):
|
||||
self.size = size
|
||||
|
||||
def __call__(self, img, target):
|
||||
return resize(img, target, self.size)
|
||||
|
||||
|
||||
class RandomCrop(object):
|
||||
def __init__(self, size):
|
||||
self.size = size
|
||||
|
||||
def __call__(self, img, target):
|
||||
region = T.RandomCrop.get_params(img, self.size)
|
||||
return crop(img, target, region)
|
||||
|
||||
|
||||
class RandomSizeCrop(object):
|
||||
def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
|
||||
# respect_boxes: True to keep all boxes
|
||||
# False to tolerence box filter
|
||||
self.min_size = min_size
|
||||
self.max_size = max_size
|
||||
self.respect_boxes = respect_boxes
|
||||
|
||||
def __call__(self, img: PIL.Image.Image, target: dict):
|
||||
init_boxes = len(target["boxes"])
|
||||
max_patience = 10
|
||||
for i in range(max_patience):
|
||||
w = random.randint(self.min_size, min(img.width, self.max_size))
|
||||
h = random.randint(self.min_size, min(img.height, self.max_size))
|
||||
region = T.RandomCrop.get_params(img, [h, w])
|
||||
result_img, result_target = crop(img, target, region)
|
||||
if (
|
||||
not self.respect_boxes
|
||||
or len(result_target["boxes"]) == init_boxes
|
||||
or i == max_patience - 1
|
||||
):
|
||||
return result_img, result_target
|
||||
return result_img, result_target
|
||||
|
||||
|
||||
class CenterCrop(object):
|
||||
def __init__(self, size):
|
||||
self.size = size
|
||||
|
||||
def __call__(self, img, target):
|
||||
image_width, image_height = img.size
|
||||
crop_height, crop_width = self.size
|
||||
crop_top = int(round((image_height - crop_height) / 2.0))
|
||||
crop_left = int(round((image_width - crop_width) / 2.0))
|
||||
return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
|
||||
|
||||
|
||||
class RandomHorizontalFlip(object):
|
||||
def __init__(self, p=0.5):
|
||||
self.p = p
|
||||
|
||||
def __call__(self, img, target):
|
||||
if random.random() < self.p:
|
||||
return hflip(img, target)
|
||||
return img, target
|
||||
|
||||
|
||||
class RandomResize(object):
|
||||
def __init__(self, sizes, max_size=None):
|
||||
assert isinstance(sizes, (list, tuple))
|
||||
self.sizes = sizes
|
||||
self.max_size = max_size
|
||||
|
||||
def __call__(self, img, target=None):
|
||||
size = random.choice(self.sizes)
|
||||
return resize(img, target, size, self.max_size)
|
||||
|
||||
|
||||
class RandomPad(object):
|
||||
def __init__(self, max_pad):
|
||||
self.max_pad = max_pad
|
||||
|
||||
def __call__(self, img, target):
|
||||
pad_x = random.randint(0, self.max_pad)
|
||||
pad_y = random.randint(0, self.max_pad)
|
||||
return pad(img, target, (pad_x, pad_y))
|
||||
|
||||
|
||||
class RandomSelect(object):
|
||||
"""
|
||||
Randomly selects between transforms1 and transforms2,
|
||||
with probability p for transforms1 and (1 - p) for transforms2
|
||||
"""
|
||||
|
||||
def __init__(self, transforms1, transforms2, p=0.5):
|
||||
self.transforms1 = transforms1
|
||||
self.transforms2 = transforms2
|
||||
self.p = p
|
||||
|
||||
def __call__(self, img, target):
|
||||
if random.random() < self.p:
|
||||
return self.transforms1(img, target)
|
||||
return self.transforms2(img, target)
|
||||
|
||||
|
||||
class ToTensor(object):
|
||||
def __call__(self, img, target):
|
||||
return F.to_tensor(img), target
|
||||
|
||||
|
||||
class RandomErasing(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.eraser = T.RandomErasing(*args, **kwargs)
|
||||
|
||||
def __call__(self, img, target):
|
||||
return self.eraser(img), target
|
||||
|
||||
|
||||
class Normalize(object):
|
||||
def __init__(self, mean, std):
|
||||
self.mean = mean
|
||||
self.std = std
|
||||
|
||||
def __call__(self, image, target=None):
|
||||
image = F.normalize(image, mean=self.mean, std=self.std)
|
||||
if target is None:
|
||||
return image, None
|
||||
target = target.copy()
|
||||
h, w = image.shape[-2:]
|
||||
if "boxes" in target:
|
||||
boxes = target["boxes"]
|
||||
boxes = box_xyxy_to_cxcywh(boxes)
|
||||
boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
|
||||
target["boxes"] = boxes
|
||||
return image, target
|
||||
|
||||
|
||||
class Compose(object):
|
||||
def __init__(self, transforms):
|
||||
self.transforms = transforms
|
||||
|
||||
def __call__(self, image, target):
|
||||
for t in self.transforms:
|
||||
image, target = t(image, target)
|
||||
return image, target
|
||||
|
||||
def __repr__(self):
|
||||
format_string = self.__class__.__name__ + "("
|
||||
for t in self.transforms:
|
||||
format_string += "\n"
|
||||
format_string += " {0}".format(t)
|
||||
format_string += "\n)"
|
||||
return format_string
|
Reference in New Issue
Block a user