diff --git a/README.md b/README.md index 329d7e2..46d0c4e 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,6 @@ Install `Grounding DINO`: pip install --no-build-isolation -e grounding_dino ``` -Downgrade the version of the `supervision` library to `0.6.0` to use its original API for visualization (we will update our code to be compatible with the latest version of `supervision` in the future release): - -```bash -pip install supervision==0.6.0 -``` - Download the pretrained `SAM 2` checkpoints: ```bash @@ -71,12 +65,16 @@ Note that `Grounding DINO` has already been supported in [Huggingface](https://h python grounded_sam2_hf_model_demo.py ``` +> [!NOTE] +> 🚨 If you encounter network issues while using the `HuggingFace` model, you can resolve them by setting the appropriate mirror source as `export HF_ENDPOINT=https://hf-mirror.com` + - Load local pretrained Grounding DINO checkpoint and inference with Grounding DINO original API (make sure you've already downloaded the pretrained checkpoint) ```bash python grounded_sam2_local_demo.py ``` + ### Grounded-SAM-2 Image Demo (with Grounding DINO 1.5 & 1.6) We've already released our most capable open-set detection model [Grounding DINO 1.5 & 1.6](https://github.com/IDEA-Research/Grounding-DINO-1.5-API), which can be combined with SAM 2 for stronger open-set detection and segmentation capability. You can apply the API token first and run Grounded-SAM-2 with Grounding DINO 1.5 as follows: diff --git a/grounded_sam2_gd1.5_demo.py b/grounded_sam2_gd1.5_demo.py index eb67989..f6ab623 100644 --- a/grounded_sam2_gd1.5_demo.py +++ b/grounded_sam2_gd1.5_demo.py @@ -18,7 +18,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor Prompt Grounding DINO 1.5 with Text for Box Prompt Generation with Cloud API """ # Step 1: initialize the config -token = "Your API token here" +token = "Your API token" config = Config(token) # Step 2: initialize the client @@ -101,21 +101,31 @@ elif masks.ndim == 4: Visualization the Predict Results """ +class_ids = np.array(list(range(len(class_names)))) + labels = [ f"{class_name} {confidence:.2f}" for class_name, confidence in zip(class_names, confidences) ] + +""" +Visualize image with supervision useful API +""" img = cv2.imread(img_path) detections = sv.Detections( xyxy=input_boxes, # (n, 4) - mask=masks, # (n, h, w) + mask=masks.astype(bool), # (n, h, w) + class_id=class_ids ) box_annotator = sv.BoxAnnotator() -annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels) +annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections) + +label_annotator = sv.LabelAnnotator() +annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame) mask_annotator = sv.MaskAnnotator() annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) -cv2.imwrite("grounded_sam2_annotated_image_with_mask.jpg", annotated_frame) \ No newline at end of file +cv2.imwrite("grounded_sam2_annotated_image_with_mask.jpg", annotated_frame) diff --git a/grounded_sam2_hf_model_demo.py b/grounded_sam2_hf_model_demo.py index 69dedcd..b912e3f 100644 --- a/grounded_sam2_hf_model_demo.py +++ b/grounded_sam2_hf_model_demo.py @@ -2,6 +2,8 @@ import cv2 import torch import numpy as np import supervision as sv +from supervision.draw.color import ColorPalette +from supervision_utils import CUSTOM_COLOR_MAP from PIL import Image from sam2.build_sam import build_sam2 from sam2.sam2_image_predictor import SAM2ImagePredictor @@ -89,6 +91,7 @@ elif masks.ndim == 4: confidences = results[0]["scores"].cpu().numpy().tolist() class_names = results[0]["labels"] +class_ids = np.array(list(range(len(class_names)))) labels = [ f"{class_name} {confidence:.2f}" @@ -102,13 +105,21 @@ Visualize image with supervision useful API img = cv2.imread(img_path) detections = sv.Detections( xyxy=input_boxes, # (n, 4) - mask=masks, # (n, h, w) + mask=masks.astype(bool), # (n, h, w) + class_id=class_ids ) -box_annotator = sv.BoxAnnotator() -annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels) +""" +Note that if you want to use default color map, +you can set color=ColorPalette.DEFAULT +""" +box_annotator = sv.BoxAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP)) +annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections) + +label_annotator = sv.LabelAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP)) +annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame) -mask_annotator = sv.MaskAnnotator() +mask_annotator = sv.MaskAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP)) annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) cv2.imwrite("grounded_sam2_annotated_image_with_mask.jpg", annotated_frame) diff --git a/grounded_sam2_local_demo.py b/grounded_sam2_local_demo.py index 7bfebba..f7e309f 100644 --- a/grounded_sam2_local_demo.py +++ b/grounded_sam2_local_demo.py @@ -64,8 +64,6 @@ masks, scores, logits = sam2_predictor.predict( multimask_output=False, ) -import pdb; pdb.set_trace() - """ Post-process the output of the model to get the masks, scores, and logits for visualization """ @@ -81,6 +79,8 @@ elif masks.ndim == 4: confidences = confidences.numpy().tolist() class_names = labels +class_ids = np.array(list(range(len(class_names)))) + labels = [ f"{class_name} {confidence:.2f}" for class_name, confidence @@ -93,11 +93,15 @@ Visualize image with supervision useful API img = cv2.imread(img_path) detections = sv.Detections( xyxy=input_boxes, # (n, 4) - mask=masks, # (n, h, w) + mask=masks.astype(bool), # (n, h, w) + class_id=class_ids ) box_annotator = sv.BoxAnnotator() -annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels) +annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections) + +label_annotator = sv.LabelAnnotator() +annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame) mask_annotator = sv.MaskAnnotator() diff --git a/grounded_sam2_tracking_demo.py b/grounded_sam2_tracking_demo.py index 94edcdc..ec8301f 100644 --- a/grounded_sam2_tracking_demo.py +++ b/grounded_sam2_tracking_demo.py @@ -154,7 +154,9 @@ for frame_idx, segments in video_segments.items(): class_id=np.array(object_ids, dtype=np.int32), ) box_annotator = sv.BoxAnnotator() - annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids]) + annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections) + label_annotator = sv.LabelAnnotator() + annotated_frame = label_annotator.annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids]) mask_annotator = sv.MaskAnnotator() annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) cv2.imwrite(os.path.join(save_dir, f"annotated_frame_{frame_idx:05d}.jpg"), annotated_frame) diff --git a/grounded_sam2_tracking_demo_with_gd1.5.py b/grounded_sam2_tracking_demo_with_gd1.5.py index 3f5045c..93fe2c1 100644 --- a/grounded_sam2_tracking_demo_with_gd1.5.py +++ b/grounded_sam2_tracking_demo_with_gd1.5.py @@ -13,8 +13,7 @@ import numpy as np import supervision as sv from PIL import Image from sam2.build_sam import build_sam2_video_predictor, build_sam2 -from sam2.sam2_image_predictor import SAM2ImagePredictor -from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection +from sam2.sam2_image_predictor import SAM2ImagePredictor from track_utils import sample_points_from_masks from video_utils import create_video_from_images @@ -177,7 +176,9 @@ for frame_idx, segments in video_segments.items(): class_id=np.array(object_ids, dtype=np.int32), ) box_annotator = sv.BoxAnnotator() - annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids]) + annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections) + label_annotator = sv.LabelAnnotator() + annotated_frame = label_annotator.annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids]) mask_annotator = sv.MaskAnnotator() annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) cv2.imwrite(os.path.join(save_dir, f"annotated_frame_{frame_idx:05d}.jpg"), annotated_frame) diff --git a/supervision_utils.py b/supervision_utils.py new file mode 100644 index 0000000..8a8d3d7 --- /dev/null +++ b/supervision_utils.py @@ -0,0 +1,18 @@ +CUSTOM_COLOR_MAP = [ + "#e6194b", + "#3cb44b", + "#ffe119", + "#0082c8", + "#f58231", + "#911eb4", + "#46f0f0", + "#f032e6", + "#d2f53c", + "#fabebe", + "#008080", + "#e6beff", + "#aa6e28", + "#fffac8", + "#800000", + "#aaffc3", +] \ No newline at end of file