support gsam2 image predictor model
This commit is contained in:
83
grounding_dino/demo/create_coco_dataset.py
Normal file
83
grounding_dino/demo/create_coco_dataset.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import typer
|
||||
from groundingdino.util.inference import load_model, load_image, predict
|
||||
from tqdm import tqdm
|
||||
import torchvision
|
||||
import torch
|
||||
import fiftyone as fo
|
||||
|
||||
|
||||
def main(
        image_directory: str = 'test_grounding_dino',
        text_prompt: str = 'bus, car',
        box_threshold: float = 0.15,
        text_threshold: float = 0.10,
        export_dataset: bool = False,
        view_dataset: bool = False,
        export_annotated_images: bool = True,
        weights_path : str = "groundingdino_swint_ogc.pth",
        config_path: str = "../../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        subsample: int = None,
    ):
    """Run Grounding DINO over a directory of images and store the detections in FiftyOne.

    Args:
        image_directory: Directory of images loaded with ``fo.Dataset.from_images_dir``.
        text_prompt: Caption passed to ``predict`` for every image.
        box_threshold: Forwarded to ``predict`` as ``box_threshold``.
        text_threshold: Forwarded to ``predict`` as ``text_threshold``.
        export_dataset: If true, export the dataset to ``coco_dataset`` in COCO
            detection format.
        view_dataset: If true, launch the FiftyOne app and block until it closes.
        export_annotated_images: If true, write images with drawn detections to
            ``images_with_bounding_boxes``.
        weights_path: Checkpoint path handed to ``load_model``.
        config_path: Model config path handed to ``load_model``.
        subsample: If given and smaller than the dataset, only this many
            samples (picked by ``dataset.take``) are processed.
    """
    model = load_model(config_path, weights_path)

    dataset = fo.Dataset.from_images_dir(image_directory)

    if subsample is not None and subsample < len(dataset):
        dataset = dataset.take(subsample).clone()

    for sample in tqdm(dataset):
        # Only the second value returned by load_image is fed to the model.
        _, image = load_image(sample.filepath)

        boxes, logits, phrases = predict(
            model=model,
            image=image,
            caption=text_prompt,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
        )

        detections = []
        for box, logit, phrase in zip(boxes, logits, phrases):
            # Convert from center-based (cx, cy, w, h) to top-left (x, y, w, h).
            rel_box = torchvision.ops.box_convert(box, 'cxcywh', 'xywh')

            # Hand FiftyOne plain Python values rather than torch tensors so the
            # label can be validated and serialized.
            detections.append(
                fo.Detection(
                    label=phrase,
                    bounding_box=rel_box.tolist(),
                    confidence=float(logit),
                )
            )

        # Store detections in a field name of your choice
        sample["detections"] = fo.Detections(detections=detections)
        sample.save()

    # loads the voxel fiftyone UI ready for viewing the dataset.
    if view_dataset:
        session = fo.launch_app(dataset)
        session.wait()

    # exports COCO dataset ready for training
    if export_dataset:
        dataset.export(
            'coco_dataset',
            dataset_type=fo.types.COCODetectionDataset,
        )

    # saves bounding boxes plotted on the input images to disk
    if export_annotated_images:
        dataset.draw_labels(
            'images_with_bounding_boxes',
            label_fields=['detections'],
        )


if __name__ == '__main__':
    typer.run(main)
|
125
grounding_dino/demo/gradio_app.py
Normal file
125
grounding_dino/demo/gradio_app.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import argparse
|
||||
from functools import partial
|
||||
import cv2
|
||||
import requests
|
||||
import os
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
# prepare the environment
# NOTE(review): these shell commands run every time this module is executed or
# imported, before gradio itself is imported below.
for _setup_cmd in (
    "python setup.py build develop --user",
    "pip install packaging==21.3",
    "pip install gradio==3.50.2",
):
    os.system(_setup_cmd)

# Silence all Python warnings for the rest of the process.
warnings.filterwarnings("ignore")
|
||||
|
||||
import gradio as gr
|
||||
|
||||
from groundingdino.models import build_model
|
||||
from groundingdino.util.slconfig import SLConfig
|
||||
from groundingdino.util.utils import clean_state_dict
|
||||
from groundingdino.util.inference import annotate, load_image, predict
|
||||
import groundingdino.datasets.transforms as T
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
|
||||
|
||||
# Model config and Hugging Face Hub checkpoint used to build the Grounding DINO
# model for this demo.
config_file = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
ckpt_repo_id = "ShilongLiu/GroundingDINO"
# NOTE: the misspelled name is referenced further down in this file; rename
# both places together if it is ever corrected.
ckpt_filenmae = "groundingdino_swint_ogc.pth"
|
||||
|
||||
|
||||
def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
    """Build the model from a config file and load weights fetched from the Hugging Face Hub.

    Args:
        model_config_path: Path given to ``SLConfig.fromfile``.
        repo_id: Hub repository passed to ``hf_hub_download``.
        filename: Checkpoint file name inside that repository.
        device: Value stored on the config's ``device`` attribute.

    Returns:
        The model in eval mode. Weights are loaded with ``map_location='cpu'``
        and ``strict=False``, so key mismatches are only printed, not raised.
    """
    cfg = SLConfig.fromfile(model_config_path)
    net = build_model(cfg)
    # NOTE(review): the device is written to the config only after the model has
    # been built, exactly as in the original code.
    cfg.device = device

    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
    state = torch.load(ckpt_path, map_location='cpu')
    load_report = net.load_state_dict(clean_state_dict(state['model']), strict=False)
    print("Model loaded from {} \n => {}".format(ckpt_path, load_report))

    net.eval()
    return net
|
||||
|
||||
def image_transform_grounding(init_image):
    """Run the resize / to-tensor / normalize pipeline on ``init_image``.

    Returns:
        A pair ``(init_image, image)``: the untouched input and the transformed
        result of the pipeline.
    """
    steps = [
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
    pipeline = T.Compose(steps)
    # The transforms take (image, target); there is no target here.
    image, _ = pipeline(init_image, None)  # 3, h, w
    return init_image, image
|
||||
|
||||
def image_transform_grounding_for_vis(init_image):
    """Apply only the resize step of the pipeline, for visualization.

    No ToTensor/Normalize step is applied here, so the result is presumably
    still an image object rather than a tensor -- TODO confirm against
    ``T.RandomResize``.
    """
    resize_only = T.Compose([
        T.RandomResize([800], max_size=1333),
    ])
    # The transforms take (image, target); there is no target here.
    resized, _ = resize_only(init_image, None)
    return resized
|
||||
|
||||
# Module-level model instance shared by every call to run_grounding.
model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)


def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
    """Detect objects described by ``grounding_caption`` and return an annotated image.

    Args:
        input_image: PIL image from the UI; converted to RGB before use.
        grounding_caption: Text prompt forwarded to ``predict``.
        box_threshold: Forwarded to ``predict``.
        text_threshold: Forwarded to ``predict``.

    Returns:
        A PIL image built from the output of ``annotate``.
    """
    init_image = input_image.convert("RGB")

    _, image_tensor = image_transform_grounding(init_image)
    # Resized copy without tensor conversion, used as the drawing canvas.
    image_pil: Image.Image = image_transform_grounding_for_vis(init_image)

    # run grounding
    boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
    annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
    # Channel order is swapped before building the PIL image; presumably
    # annotate() returns BGR -- TODO confirm.
    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))

    return image_with_box
|
||||
|
||||
if __name__ == "__main__":
    # Command-line switches; both are forwarded to block.launch() below.
    parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
    parser.add_argument("--debug", action="store_true", help="using debug mode")
    parser.add_argument("--share", action="store_true", help="share the app")
    args = parser.parse_args()

    # Both threshold sliders share the same range, default and step.
    slider_kwargs = dict(minimum=0.0, maximum=1.0, value=0.25, step=0.001)

    block = gr.Blocks().queue()
    with block:
        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
        gr.Markdown("### Open-World Detection with Grounding DINO")

        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                input_image = gr.Image(source='upload', type="pil")
                grounding_caption = gr.Textbox(label="Detection Prompt")
                run_button = gr.Button(label="Run")
                with gr.Accordion("Advanced options", open=False):
                    box_threshold = gr.Slider(label="Box Threshold", **slider_kwargs)
                    text_threshold = gr.Slider(label="Text Threshold", **slider_kwargs)

            # Right column: annotated output image.
            with gr.Column():
                gallery = gr.outputs.Image(type="pil").style(full_width=True, full_height=True)

        run_button.click(
            fn=run_grounding,
            inputs=[input_image, grounding_caption, box_threshold, text_threshold],
            outputs=[gallery],
        )

    block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
214
grounding_dino/demo/inference_on_a_image.py
Normal file
214
grounding_dino/demo/inference_on_a_image.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
import groundingdino.datasets.transforms as T
|
||||
from groundingdino.models import build_model
|
||||
from groundingdino.util import box_ops
|
||||
from groundingdino.util.slconfig import SLConfig
|
||||
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
|
||||
from groundingdino.util.vl_utils import create_positive_map_from_span
|
||||
|
||||
|
||||
def plot_boxes_to_image(image_pil, tgt):
    """Draw labelled boxes on ``image_pil`` and build a matching box mask.

    Args:
        image_pil: PIL image to draw on. It is modified in place.
        tgt: Dict with keys ``"size"`` (unpacked as ``H, W``), ``"boxes"``
            (iterable of 4-element tensors scaled by ``[W, H, W, H]`` and then
            treated as center-x, center-y, width, height) and ``"labels"``
            (one label per box).

    Returns:
        ``(image_pil, mask)`` where ``mask`` is a new "L"-mode image of the same
        size with every box filled with 255.
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    # Loop-invariant values, computed once instead of once per box.
    scale = torch.Tensor([W, H, W, H])
    font = ImageFont.load_default()
    # Fonts without getbbox fall back to the older draw.textsize API.
    use_textbbox = hasattr(font, "getbbox")

    # draw boxes and masks
    for box, label in zip(boxes, labels):
        # from 0..1 to 0..W, 0..H (creates a new tensor, the input is untouched)
        box = box * scale
        # from center/size to corner coordinates
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        # random color
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        # draw
        x0, y0, x1, y1 = (int(v) for v in box)

        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        text = str(label)
        if use_textbbox:
            bbox = draw.textbbox((x0, y0), text, font)
        else:
            w, h = draw.textsize(text, font)
            bbox = (x0, y0, w + x0, y0 + h)
        # Filled background behind the label, then the label itself in white.
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), text, fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask
|
||||
|
||||
|
||||
def load_image(image_path):
    """Open an image as RGB and run the resize / to-tensor / normalize pipeline.

    Returns:
        ``(image_pil, image)``: the RGB PIL image as loaded and the transformed
        result of the pipeline.
    """
    image_pil = Image.open(image_path).convert("RGB")

    pipeline = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    # The transforms take (image, target); there is no target here.
    image, _ = pipeline(image_pil, None)  # 3, h, w
    return image_pil, image
|
||||
|
||||
|
||||
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build the model from a config file and load a local checkpoint.

    Args:
        model_config_path: Path given to ``SLConfig.fromfile``.
        model_checkpoint_path: Checkpoint file; its ``"model"`` entry is loaded.
        cpu_only: When true the config device is "cpu", otherwise "cuda".

    Returns:
        The model in eval mode. Weights are loaded with ``strict=False`` and the
        load result is printed.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    net = build_model(cfg)

    state = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = net.load_state_dict(clean_state_dict(state["model"]), strict=False)
    print(load_res)

    net.eval()
    return net
|
||||
|
||||
|
||||
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run the model on one image and return the boxes and phrases that pass the thresholds.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(images, captions=[...])``.
        image: Image tensor; a batch dimension is added before the forward pass.
        caption: Text prompt. It is lower-cased, stripped and terminated with
            "." before use.
        box_threshold: Minimum score for a query (or a query/phrase pair in
            token-span mode) to be kept.
        text_threshold: Per-token threshold used to extract phrases when
            ``token_spans`` is None.
        with_logits: If true, append the score (first 4 characters) to each
            phrase.
        cpu_only: Run on "cpu" instead of "cuda".
        token_spans: Optional list of phrases, each a list of ``(start, end)``
            character spans into the normalized caption.

    Returns:
        ``(boxes_filt, pred_phrases)``: kept boxes on CPU and one phrase per
        kept box.

    Raises:
        AssertionError: If both ``text_threshold`` and ``token_spans`` are None.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."

    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)

    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        # Keep queries whose best token score exceeds the box threshold.
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4

        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit in logits_filt:
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # Tokenize the normalized caption (the text the model actually saw and
        # the text the spans are sliced from), not a module-level global.
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256

        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            kept_logits = logit_phr[filt_mask]
            if with_logits:
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in kept_logits])
            else:
                # One phrase per kept box, so phrases stay aligned with boxes.
                all_phrases.extend([phrase] * len(kept_logits))
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases

    return boxes_filt, pred_phrases
|
||||
|
||||
|
||||
if __name__ == "__main__":

    parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    # Not marked required: with required=True the default could never apply.
    parser.add_argument(
        "--output_dir", "-o", type=str, default="outputs", help="output directory"
    )

    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument(
        "--token_spans", type=str, default=None,
        help=(
            "The positions of start and end positions of phrases of interest. "
            "For example, a caption is 'a cat and a dog', "
            "if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. "
            "if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. "
        ),
    )

    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file  # change the path of the model config file
    checkpoint_path = args.checkpoint_path  # change the path of the model
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)

    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))

    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")

    # run model
    # NOTE(security): eval() executes whatever expression is passed via
    # --token_spans. Only run this script with trusted arguments; consider
    # ast.literal_eval if the input can come from an untrusted source.
    boxes_filt, pred_phrases = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, token_spans=eval(f"{token_spans}")
    )

    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
|
233
grounding_dino/demo/test_ap_on_coco.py
Normal file
233
grounding_dino/demo/test_ap_on_coco.py
Normal file
@@ -0,0 +1,233 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader, DistributedSampler
|
||||
|
||||
from groundingdino.models import build_model
|
||||
import groundingdino.datasets.transforms as T
|
||||
from groundingdino.util import box_ops, get_tokenlizer
|
||||
from groundingdino.util.misc import clean_state_dict, collate_fn
|
||||
from groundingdino.util.slconfig import SLConfig
|
||||
|
||||
# from torchvision.datasets import CocoDetection
|
||||
import torchvision
|
||||
|
||||
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
|
||||
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
|
||||
|
||||
|
||||
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
    """Build the model from a config file and load a local checkpoint.

    Args:
        model_config_path: Path given to ``SLConfig.fromfile``.
        model_checkpoint_path: Checkpoint file; its ``"model"`` entry is loaded.
        device: Value stored on the config's ``device`` attribute before the
            model is built.

    Returns:
        The model in eval mode, with weights loaded non-strictly.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    net = build_model(cfg)

    state = torch.load(model_checkpoint_path, map_location="cpu")
    net.load_state_dict(clean_state_dict(state["model"]), strict=False)

    net.eval()
    return net
|
||||
|
||||
|
||||
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO detection dataset that yields a compact target dict per image.

    Each item is ``(img, target)`` where ``target`` holds ``"image_id"``,
    ``"boxes"`` (corner-format boxes clamped to the image) and ``"orig_size"``
    (``[h, w]``), optionally passed through ``transforms`` together with the
    image.
    """

    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        # Joint (image, target) transform; may be None.
        self._transforms = transforms

    def __getitem__(self, idx):
        img, annotations = super().__getitem__(idx)  # annotations: list

        w, h = img.size
        boxes = [obj["bbox"] for obj in annotations]
        # reshape keeps a (0, 4) tensor for images without annotations
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        # drop boxes with zero or negative width/height after clamping
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]

        target = {
            "image_id": self.ids[idx],
            "boxes": boxes,
            "orig_size": torch.as_tensor([int(h), int(w)]),
        }

        # Return the processed target dict in both cases, so the item format
        # does not depend on whether a transform was supplied.
        if self._transforms is not None:
            img, target = self._transforms(img, target)

        return img, target
|
||||
|
||||
|
||||
class PostProcessCocoGrounding(nn.Module):
    """Convert raw model outputs into per-image score/label/box dicts for the COCO API."""

    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
        super().__init__()
        self.num_select = num_select

        assert coco_api is not None
        cat_list = [entry['name'] for entry in coco_api.dataset['categories']]
        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
        span_per_category = [cat2tokenspan[name] for name in cat_list]
        positive_map = create_positive_map_from_span(
            tokenlizer(captions), span_per_category)  # 80, 256. normed

        # Position of a category in the category list -> label id used as the
        # row index of the positive map below.
        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}

        # build a mapping from label_id to pos_map; rows for unused ids stay zero
        label_pos_map = torch.zeros((91, 256))
        for list_index, label_id in id_map.items():
            label_pos_map[label_id] = positive_map[list_index]
        self.positive_map = label_pos_map

    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False):
        """Select the top-scoring (query, label) pairs and scale their boxes.

        Parameters:
            outputs: raw outputs of the model, with 'pred_logits' and 'pred_boxes'
            target_sizes: tensor of dimension [batch_size x 2] containing the (height, width) of each image
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
            not_to_xyxy: if true, boxes are used as given instead of being
                         converted with box_ops.box_cxcywh_to_xyxy

        Returns:
            One dict per image with 'scores', 'labels' and 'boxes' for the
            ``num_select`` best (query, label) pairs.
        """
        raw_logits, raw_boxes = outputs['pred_logits'], outputs['pred_boxes']

        # pos map to logit: per-token probabilities projected onto label rows
        token_prob = raw_logits.sigmoid()  # bs, 100, 256
        label_rows = self.positive_map.to(token_prob.device)
        # (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
        label_prob = token_prob @ label_rows.T

        assert len(raw_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        # top-k over the flattened (query, label) grid of every image
        num_labels = label_prob.shape[2]
        scores, flat_index = torch.topk(
            label_prob.view(raw_logits.shape[0], -1), self.num_select, dim=1)
        query_index = flat_index // num_labels
        labels = flat_index % num_labels

        boxes = raw_boxes if not_to_xyxy else box_ops.box_cxcywh_to_xyxy(raw_boxes)
        boxes = torch.gather(
            boxes, 1, query_index.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        return [
            {'scores': s, 'labels': l, 'boxes': b}
            for s, l, b in zip(scores, labels, boxes)
        ]
|
||||
|
||||
|
||||
def main(args):
    """Evaluate a Grounding DINO checkpoint on a COCO-format detection set.

    Args:
        args: Namespace with ``config_file``, ``checkpoint_path``, ``device``,
            ``image_dir``, ``anno_path``, ``num_workers`` and optionally
            ``num_select`` (defaults to 300 when absent).

    Prints progress every 30 batches, then the evaluator summary and the final
    bbox stats.
    """
    # config
    cfg = SLConfig.fromfile(args.config_file)

    # build model on the requested device (not the loader's "cuda" default)
    model = load_model(args.config_file, args.checkpoint_path, device=args.device)
    model = model.to(args.device)
    model = model.eval()

    # build dataloader
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    dataset = CocoDetection(
        args.image_dir, args.anno_path, transforms=transform)
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)

    # build post processor; honor --num_select instead of silently ignoring it
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=getattr(args, "num_select", 300),
        coco_api=dataset.coco, tokenlizer=tokenlizer)

    # build evaluator
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)

    # build captions: all category names joined into one prompt
    category_dict = dataset.coco.dataset['categories']
    cat_list = [item['name'] for item in category_dict]
    caption = " . ".join(cat_list) + ' .'
    print("Input text prompt:", caption)

    # run inference
    total = len(data_loader)
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # get images and captions
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs

        # feed to the model; no gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(images, captions=input_captions)

        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(images.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)

        done = i + 1  # number of batches finished so far
        if done % 30 == 0:
            used_time = time.time() - start
            eta = total / done * used_time - used_time
            print(
                f"processed {done}/{total} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")

    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()

    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Grounding DINO eval on COCO", add_help=True)

    # load model
    parser.add_argument(
        "--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument(
        "--device", type=str, default="cuda", help="running device (default: cuda)")

    # post processing
    parser.add_argument(
        "--num_select", type=int, default=300, help="number of topk to select")

    # coco info
    parser.add_argument(
        "--anno_path", type=str, required=True, help="coco root")
    parser.add_argument(
        "--image_dir", type=str, required=True, help="coco image dir")
    parser.add_argument(
        "--num_workers", type=int, default=4, help="number of workers for dataloader")

    args = parser.parse_args()

    main(args)
|
Reference in New Issue
Block a user