support gsam2 image predictor model

2024-08-01 17:05:01 +08:00
parent 72501fecf8
commit 1dacb47840
333 changed files with 24764 additions and 0 deletions
--- a/grounding_dino/demo/create_coco_dataset.py
+++ b/grounding_dino/demo/create_coco_dataset.py
@@ -0,0 +1,83 @@
+import typer
+from groundingdino.util.inference import load_model, load_image, predict
+from tqdm import tqdm
+import torchvision
+import torch
+import fiftyone as fo
+
+
+def main(
+        image_directory: str = 'test_grounding_dino',
+        text_prompt: str = 'bus, car',
+        box_threshold: float = 0.15, 
+        text_threshold: float = 0.10,
+        export_dataset: bool = False,
+        view_dataset: bool = False,
+        export_annotated_images: bool = True,
+        weights_path : str = "groundingdino_swint_ogc.pth",
+        config_path: str = "../../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
+        subsample: int = None,
+    ):
+
+    model = load_model(config_path, weights_path)
+    
+    dataset = fo.Dataset.from_images_dir(image_directory)
+
+    samples = []
+
+    if subsample is not None: 
+        
+        if subsample < len(dataset):
+            dataset = dataset.take(subsample).clone()
+    
+    for sample in tqdm(dataset):
+
+        image_source, image = load_image(sample.filepath)
+
+        boxes, logits, phrases = predict(
+            model=model, 
+            image=image, 
+            caption=text_prompt, 
+            box_threshold=box_threshold, 
+            text_threshold=text_threshold,
+        )
+
+        detections = [] 
+
+        for box, logit, phrase in zip(boxes, logits, phrases):
+
+            rel_box = torchvision.ops.box_convert(box, 'cxcywh', 'xywh')
+
+            detections.append(
+                fo.Detection(
+                    label=phrase, 
+                    bounding_box=rel_box,
+                    confidence=logit,
+            ))
+
+        # Store detections in a field name of your choice
+        sample["detections"] = fo.Detections(detections=detections)
+        sample.save()
+
+    # loads the voxel fiftyone UI ready for viewing the dataset.
+    if view_dataset:
+        session = fo.launch_app(dataset)
+        session.wait()
+        
+    # exports COCO dataset ready for training
+    if export_dataset:
+        dataset.export(
+            'coco_dataset',
+            dataset_type=fo.types.COCODetectionDataset,
+        )
+        
+    # saves bounding boxes plotted on the input images to disk
+    if export_annotated_images:
+        dataset.draw_labels(
+            'images_with_bounding_boxes',
+            label_fields=['detections']
+        )
+
+
+# Script entry point: main() is handed to typer.run when this file is executed directly.
+if __name__ == '__main__':
+    typer.run(main)
--- a/grounding_dino/demo/gradio_app.py
+++ b/grounding_dino/demo/gradio_app.py
@@ -0,0 +1,125 @@
+import argparse
+from functools import partial
+import cv2
+import requests
+import os
+from io import BytesIO
+from PIL import Image
+import numpy as np
+from pathlib import Path
+
+
+import warnings
+
+import torch
+
+# prepare the environment
+# NOTE(review): these shell commands run at module import time, before gradio and
+# groundingdino are imported further down, and their exit codes are discarded.
+os.system("python setup.py build develop --user")
+os.system("pip install packaging==21.3")
+os.system("pip install gradio==3.50.2")
+
+
+warnings.filterwarnings("ignore")
+
+import gradio as gr
+
+from groundingdino.models import build_model
+from groundingdino.util.slconfig import SLConfig
+from groundingdino.util.utils import clean_state_dict
+from groundingdino.util.inference import annotate, load_image, predict
+import groundingdino.datasets.transforms as T
+
+from huggingface_hub import hf_hub_download
+
+
+
+# Model config path and Hugging Face Hub checkpoint location for the Grounding DINO model
+config_file = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
+ckpt_repo_id = "ShilongLiu/GroundingDINO"
+ckpt_filenmae = "groundingdino_swint_ogc.pth"
+
+
+def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
+    args = SLConfig.fromfile(model_config_path) 
+    model = build_model(args)
+    args.device = device
+
+    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
+    checkpoint = torch.load(cache_file, map_location='cpu')
+    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
+    print("Model loaded from {} \n => {}".format(cache_file, log))
+    _ = model.eval()
+    return model    
+
+def image_transform_grounding(init_image):
+    transform = T.Compose([
+        T.RandomResize([800], max_size=1333),
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    image, _ = transform(init_image, None) # 3, h, w
+    return init_image, image
+
+def image_transform_grounding_for_vis(init_image):
+    transform = T.Compose([
+        T.RandomResize([800], max_size=1333),
+    ])
+    image, _ = transform(init_image, None) # 3, h, w
+    return image
+
+# Module-level model instance, created at import time with load_model_hf's default device ('cpu').
+model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
+
+def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
+    init_image = input_image.convert("RGB")
+    original_size = init_image.size
+
+    _, image_tensor = image_transform_grounding(init_image)
+    image_pil: Image = image_transform_grounding_for_vis(init_image)
+
+    # run grounidng
+    boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
+    annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
+    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
+
+
+    return image_with_box
+
+# Script entry point: parse the CLI flags, build the Gradio UI and launch the server.
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
+    parser.add_argument("--debug", action="store_true", help="using debug mode")
+    parser.add_argument("--share", action="store_true", help="share the app")
+    args = parser.parse_args()
+
+    block = gr.Blocks().queue()
+    with block:
+        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
+        gr.Markdown("### Open-World Detection with Grounding DINO")
+
+        with gr.Row():
+            # Left column: inputs (image, prompt, run button, thresholds)
+            with gr.Column():
+                input_image = gr.Image(source='upload', type="pil")
+                grounding_caption = gr.Textbox(label="Detection Prompt")
+                run_button = gr.Button(label="Run")
+                with gr.Accordion("Advanced options", open=False):
+                    box_threshold = gr.Slider(
+                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
+                    )
+                    text_threshold = gr.Slider(
+                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
+                    )
+
+            # Right column: the annotated output image
+            with gr.Column():
+                gallery = gr.outputs.Image(
+                    type="pil",
+                    # label="grounding results"
+                ).style(full_width=True, full_height=True)
+                # gallery = gr.Gallery(label="Generated images", show_label=False).style(
+                #         grid=[1], height="auto", container=True, full_width=True, full_height=True)
+
+        # Clicking the button calls run_grounding with the four inputs and shows its return value.
+        run_button.click(fn=run_grounding, inputs=[
+                        input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])
+
+
+    # NOTE(review): server_name='0.0.0.0' presumably exposes the app on all network
+    # interfaces - confirm this is intended for the deployment environment.
+    block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
+
--- a/grounding_dino/demo/image_editing_with_groundingdino_gligen.ipynb
+++ b/grounding_dino/demo/image_editing_with_groundingdino_gligen.ipynb
--- a/grounding_dino/demo/image_editing_with_groundingdino_stablediffusion.ipynb
+++ b/grounding_dino/demo/image_editing_with_groundingdino_stablediffusion.ipynb
--- a/grounding_dino/demo/inference_on_a_image.py
+++ b/grounding_dino/demo/inference_on_a_image.py
@@ -0,0 +1,214 @@
+import argparse
+import os
+import sys
+
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageFont
+
+import groundingdino.datasets.transforms as T
+from groundingdino.models import build_model
+from groundingdino.util import box_ops
+from groundingdino.util.slconfig import SLConfig
+from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
+from groundingdino.util.vl_utils import create_positive_map_from_span
+
+
+def plot_boxes_to_image(image_pil, tgt):
+    H, W = tgt["size"]
+    boxes = tgt["boxes"]
+    labels = tgt["labels"]
+    assert len(boxes) == len(labels), "boxes and labels must have same length"
+
+    draw = ImageDraw.Draw(image_pil)
+    mask = Image.new("L", image_pil.size, 0)
+    mask_draw = ImageDraw.Draw(mask)
+
+    # draw boxes and masks
+    for box, label in zip(boxes, labels):
+        # from 0..1 to 0..W, 0..H
+        box = box * torch.Tensor([W, H, W, H])
+        # from xywh to xyxy
+        box[:2] -= box[2:] / 2
+        box[2:] += box[:2]
+        # random color
+        color = tuple(np.random.randint(0, 255, size=3).tolist())
+        # draw
+        x0, y0, x1, y1 = box
+        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
+
+        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
+        # draw.text((x0, y0), str(label), fill=color)
+
+        font = ImageFont.load_default()
+        if hasattr(font, "getbbox"):
+            bbox = draw.textbbox((x0, y0), str(label), font)
+        else:
+            w, h = draw.textsize(str(label), font)
+            bbox = (x0, y0, w + x0, y0 + h)
+        # bbox = draw.textbbox((x0, y0), str(label))
+        draw.rectangle(bbox, fill=color)
+        draw.text((x0, y0), str(label), fill="white")
+
+        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
+
+    return image_pil, mask
+
+
+def load_image(image_path):
+    # load image
+    image_pil = Image.open(image_path).convert("RGB")  # load image
+
+    transform = T.Compose(
+        [
+            T.RandomResize([800], max_size=1333),
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ]
+    )
+    image, _ = transform(image_pil, None)  # 3, h, w
+    return image_pil, image
+
+
+def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
+    args = SLConfig.fromfile(model_config_path)
+    args.device = "cuda" if not cpu_only else "cpu"
+    model = build_model(args)
+    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+    print(load_res)
+    _ = model.eval()
+    return model
+
+
+def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
+    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
+    caption = caption.lower()
+    caption = caption.strip()
+    if not caption.endswith("."):
+        caption = caption + "."
+    device = "cuda" if not cpu_only else "cpu"
+    model = model.to(device)
+    image = image.to(device)
+    with torch.no_grad():
+        outputs = model(image[None], captions=[caption])
+    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"][0]  # (nq, 4)
+
+    # filter output
+    if token_spans is None:
+        logits_filt = logits.cpu().clone()
+        boxes_filt = boxes.cpu().clone()
+        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+        logits_filt = logits_filt[filt_mask]  # num_filt, 256
+        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
+
+        # get phrase
+        tokenlizer = model.tokenizer
+        tokenized = tokenlizer(caption)
+        # build pred
+        pred_phrases = []
+        for logit, box in zip(logits_filt, boxes_filt):
+            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
+            if with_logits:
+                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
+            else:
+                pred_phrases.append(pred_phrase)
+    else:
+        # given-phrase mode
+        positive_maps = create_positive_map_from_span(
+            model.tokenizer(text_prompt),
+            token_span=token_spans
+        ).to(image.device) # n_phrase, 256
+
+        logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
+        all_logits = []
+        all_phrases = []
+        all_boxes = []
+        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
+            # get phrase
+            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
+            # get mask
+            filt_mask = logit_phr > box_threshold
+            # filt box
+            all_boxes.append(boxes[filt_mask])
+            # filt logits
+            all_logits.append(logit_phr[filt_mask])
+            if with_logits:
+                logit_phr_num = logit_phr[filt_mask]
+                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
+            else:
+                all_phrases.extend([phrase for _ in range(len(filt_mask))])
+        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
+        pred_phrases = all_phrases
+
+
+    return boxes_filt, pred_phrases
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
+    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
+    parser.add_argument(
+        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
+    )
+    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
+    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
+    parser.add_argument(
+        "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
+    )
+
+    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
+    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
+    parser.add_argument("--token_spans", type=str, default=None, help=
+                        "The positions of start and end positions of phrases of interest. \
+                        For example, a caption is 'a cat and a dog', \
+                        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
+                        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
+                        ")
+
+    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
+    args = parser.parse_args()
+
+    # cfg
+    config_file = args.config_file  # change the path of the model config file
+    checkpoint_path = args.checkpoint_path  # change the path of the model
+    image_path = args.image_path
+    text_prompt = args.text_prompt
+    output_dir = args.output_dir
+    box_threshold = args.box_threshold
+    text_threshold = args.text_threshold
+    token_spans = args.token_spans
+
+    # make dir
+    os.makedirs(output_dir, exist_ok=True)
+    # load image
+    image_pil, image = load_image(image_path)
+    # load model
+    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
+
+    # visualize raw image
+    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
+
+    # set the text_threshold to None if token_spans is set.
+    if token_spans is not None:
+        text_threshold = None
+        print("Using token_spans. Set the text_threshold to None.")
+
+
+    # run model
+    boxes_filt, pred_phrases = get_grounding_output(
+        model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, token_spans=eval(f"{token_spans}")
+    )
+
+    # visualize pred
+    size = image_pil.size
+    pred_dict = {
+        "boxes": boxes_filt,
+        "size": [size[1], size[0]],  # H,W
+        "labels": pred_phrases,
+    }
+    # import ipdb; ipdb.set_trace()
+    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
+    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
--- a/grounding_dino/demo/test_ap_on_coco.py
+++ b/grounding_dino/demo/test_ap_on_coco.py
@@ -0,0 +1,233 @@
+import argparse
+import os
+import sys
+import time
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, DistributedSampler
+
+from groundingdino.models import build_model
+import groundingdino.datasets.transforms as T
+from groundingdino.util import box_ops, get_tokenlizer
+from groundingdino.util.misc import clean_state_dict, collate_fn
+from groundingdino.util.slconfig import SLConfig
+
+# from torchvision.datasets import CocoDetection
+import torchvision
+
+from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
+from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
+
+
+def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
+    args = SLConfig.fromfile(model_config_path)
+    args.device = device
+    model = build_model(args)
+    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+    model.eval()
+    return model
+
+
+class CocoDetection(torchvision.datasets.CocoDetection):
+    def __init__(self, img_folder, ann_file, transforms):
+        super().__init__(img_folder, ann_file)
+        self._transforms = transforms
+
+    def __getitem__(self, idx):
+        img, target = super().__getitem__(idx)  # target: list
+
+        # import ipdb; ipdb.set_trace()
+
+        w, h = img.size
+        boxes = [obj["bbox"] for obj in target]
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        # filt invalid boxes/masks/keypoints
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+
+        target_new = {}
+        image_id = self.ids[idx]
+        target_new["image_id"] = image_id
+        target_new["boxes"] = boxes
+        target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
+
+        if self._transforms is not None:
+            img, target = self._transforms(img, target_new)
+
+        return img, target
+
+
+class PostProcessCocoGrounding(nn.Module):
+    """ This module converts the model's output into the format expected by the coco api"""
+
+    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
+        super().__init__()
+        self.num_select = num_select
+
+        assert coco_api is not None
+        category_dict = coco_api.dataset['categories']
+        cat_list = [item['name'] for item in category_dict]
+        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
+        tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
+        positive_map = create_positive_map_from_span(
+            tokenlizer(captions), tokenspanlist)  # 80, 256. normed
+
+        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
+                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
+
+        # build a mapping from label_id to pos_map
+        new_pos_map = torch.zeros((91, 256))
+        for k, v in id_map.items():
+            new_pos_map[v] = positive_map[k]
+        self.positive_map = new_pos_map
+
+    @torch.no_grad()
+    def forward(self, outputs, target_sizes, not_to_xyxy=False):
+        """ Perform the computation
+        Parameters:
+            outputs: raw outputs of the model
+            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
+                          For evaluation, this must be the original image size (before any data augmentation)
+                          For visualization, this should be the image size after data augment, but before padding
+        """
+        num_select = self.num_select
+        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
+
+        # pos map to logit
+        prob_to_token = out_logits.sigmoid()  # bs, 100, 256
+        pos_maps = self.positive_map.to(prob_to_token.device)
+        # (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
+        prob_to_label = prob_to_token @ pos_maps.T
+
+        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+        #     import ipdb; ipdb.set_trace()
+
+        assert len(out_logits) == len(target_sizes)
+        assert target_sizes.shape[1] == 2
+
+        prob = prob_to_label
+        topk_values, topk_indexes = torch.topk(
+            prob.view(out_logits.shape[0], -1), num_select, dim=1)
+        scores = topk_values
+        topk_boxes = topk_indexes // prob.shape[2]
+        labels = topk_indexes % prob.shape[2]
+
+        if not_to_xyxy:
+            boxes = out_bbox
+        else:
+            boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
+
+        boxes = torch.gather(
+            boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{'scores': s, 'labels': l, 'boxes': b}
+                   for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+
+def main(args):
+    # config
+    cfg = SLConfig.fromfile(args.config_file)
+
+    # build model
+    model = load_model(args.config_file, args.checkpoint_path)
+    model = model.to(args.device)
+    model = model.eval()
+
+    # build dataloader
+    transform = T.Compose(
+        [
+            T.RandomResize([800], max_size=1333),
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ]
+    )
+    dataset = CocoDetection(
+        args.image_dir, args.anno_path, transforms=transform)
+    data_loader = DataLoader(
+        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
+
+    # build post processor
+    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
+    postprocessor = PostProcessCocoGrounding(
+        coco_api=dataset.coco, tokenlizer=tokenlizer)
+
+    # build evaluator
+    evaluator = CocoGroundingEvaluator(
+        dataset.coco, iou_types=("bbox",), useCats=True)
+
+    # build captions
+    category_dict = dataset.coco.dataset['categories']
+    cat_list = [item['name'] for item in category_dict]
+    caption = " . ".join(cat_list) + ' .'
+    print("Input text prompt:", caption)
+
+    # run inference
+    start = time.time()
+    for i, (images, targets) in enumerate(data_loader):
+        # get images and captions
+        images = images.tensors.to(args.device)
+        bs = images.shape[0]
+        input_captions = [caption] * bs
+
+        # feed to the model
+        outputs = model(images, captions=input_captions)
+
+        orig_target_sizes = torch.stack(
+            [t["orig_size"] for t in targets], dim=0).to(images.device)
+        results = postprocessor(outputs, orig_target_sizes)
+        cocogrounding_res = {
+            target["image_id"]: output for target, output in zip(targets, results)}
+        evaluator.update(cocogrounding_res)
+
+        if (i+1) % 30 == 0:
+            used_time = time.time() - start
+            eta = len(data_loader) / (i+1e-5) * used_time - used_time
+            print(
+                f"processed {i}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
+
+    evaluator.synchronize_between_processes()
+    evaluator.accumulate()
+    evaluator.summarize()
+
+    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
+
+
+# Script entry point: define the command-line options and run the evaluation.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        "Grounding DINO eval on COCO", add_help=True)
+    # load model
+    parser.add_argument("--config_file", "-c", type=str,
+                        required=True, help="path to config file")
+    parser.add_argument(
+        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
+    )
+    parser.add_argument("--device", type=str, default="cuda",
+                        help="running device (default: cuda)")
+
+    # post processing
+    parser.add_argument("--num_select", type=int, default=300,
+                        help="number of topk to select")
+
+    # coco info
+    # --anno_path is passed to CocoDetection as the annotation file (ann_file),
+    # --image_dir as the image folder (img_folder).
+    parser.add_argument("--anno_path", type=str,
+                        required=True, help="coco root")
+    parser.add_argument("--image_dir", type=str,
+                        required=True, help="coco image dir")
+    parser.add_argument("--num_workers", type=int, default=4,
+                        help="number of workers for dataloader")
+    args = parser.parse_args()
+
+    main(args)