upgrade supervision to 0.22.0 and refine custom API usage
README.md (10 changed lines)
@@ -41,12 +41,6 @@ Install `Grounding DINO`:
 pip install --no-build-isolation -e grounding_dino
 ```
 
-Downgrade the version of the `supervision` library to `0.6.0` to use its original API for visualization (we will update our code to be compatible with the latest version of `supervision` in the future release):
-
-```bash
-pip install supervision==0.6.0
-```
-
 Download the pretrained `SAM 2` checkpoints:
 
 ```bash
@@ -71,12 +65,16 @@ Note that `Grounding DINO` has already been supported in [Huggingface](https://h
 python grounded_sam2_hf_model_demo.py
 ```
 
+> [!NOTE]
+> 🚨 If you encounter network issues while using the `HuggingFace` model, you can resolve them by setting the appropriate mirror source as `export HF_ENDPOINT=https://hf-mirror.com`
+
 - Load local pretrained Grounding DINO checkpoint and inference with Grounding DINO original API (make sure you've already downloaded the pretrained checkpoint)
 
 ```bash
 python grounded_sam2_local_demo.py
 ```
 
 
 ### Grounded-SAM-2 Image Demo (with Grounding DINO 1.5 & 1.6)
 
 We've already released our most capable open-set detection model [Grounding DINO 1.5 & 1.6](https://github.com/IDEA-Research/Grounding-DINO-1.5-API), which can be combined with SAM 2 for stronger open-set detection and segmentation capability. You can apply the API token first and run Grounded-SAM-2 with Grounding DINO 1.5 as follows:
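The `supervision==0.6.0` pin is dropped above because the demo scripts now target the current `supervision` release (0.22.0, per the commit title), where box drawing and label drawing are handled by separate annotators. A minimal sketch of that pattern, not taken from this commit, with a placeholder image and made-up boxes and labels:

```python
import cv2
import numpy as np
import supervision as sv

# Placeholder image and one made-up detection; in the demos these come from
# Grounding DINO boxes and SAM 2 masks.
img = np.zeros((240, 320, 3), dtype=np.uint8)
detections = sv.Detections(
    xyxy=np.array([[30.0, 40.0, 120.0, 160.0]]),  # (n, 4) boxes
    class_id=np.array([0]),
)
labels = ["car 0.87"]  # hypothetical "class confidence" strings

# In supervision >= 0.21, BoxAnnotator.annotate() no longer accepts `labels`;
# label text is drawn by a separate LabelAnnotator.
annotated = sv.BoxAnnotator().annotate(scene=img.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(scene=annotated, detections=detections, labels=labels)
cv2.imwrite("annotated.jpg", annotated)
```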
@@ -18,7 +18,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 Prompt Grounding DINO 1.5 with Text for Box Prompt Generation with Cloud API
 """
 # Step 1: initialize the config
-token = "Your API token here"
+token = "Your API token"
 config = Config(token)
 
 # Step 2: initialize the client
@@ -101,21 +101,31 @@ elif masks.ndim == 4:
 Visualization the Predict Results
 """
 
+class_ids = np.array(list(range(len(class_names))))
+
 labels = [
     f"{class_name} {confidence:.2f}"
     for class_name, confidence
     in zip(class_names, confidences)
 ]
 
+"""
+Visualize image with supervision useful API
+"""
 img = cv2.imread(img_path)
 detections = sv.Detections(
     xyxy=input_boxes,  # (n, 4)
-    mask=masks,  # (n, h, w)
+    mask=masks.astype(bool),  # (n, h, w)
+    class_id=class_ids
 )
 
 box_annotator = sv.BoxAnnotator()
-annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels)
+annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+label_annotator = sv.LabelAnnotator()
+annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
 cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame)
 
 mask_annotator = sv.MaskAnnotator()
 annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
 cv2.imwrite("grounded_sam2_annotated_image_with_mask.jpg", annotated_frame)
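The hunk above changes how the detections are handed to `supervision` in the Grounding DINO 1.5 cloud-API demo: masks are cast to boolean and an explicit `class_id` array is attached so the annotators can color per class. A small, self-contained sketch of that construction, with placeholder boxes, masks, and class names standing in for the model outputs:

```python
import numpy as np
import supervision as sv

# Placeholders standing in for the demo's model outputs.
input_boxes = np.array([[10.0, 10.0, 60.0, 60.0],
                        [40.0, 20.0, 90.0, 80.0]])               # (n, 4)
masks = (np.random.rand(2, 120, 160) > 0.5).astype(np.float32)   # (n, h, w), non-boolean
class_names = ["dog", "cat"]                                      # made-up prompt classes
class_ids = np.array(list(range(len(class_names))))

detections = sv.Detections(
    xyxy=input_boxes,
    mask=masks.astype(bool),   # current supervision expects a boolean mask array
    class_id=class_ids,        # lets the annotators pick per-class colors
)
print(len(detections), detections.mask.dtype)  # 2 bool
```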
@@ -2,6 +2,8 @@ import cv2
 import torch
 import numpy as np
 import supervision as sv
+from supervision.draw.color import ColorPalette
+from supervision_utils import CUSTOM_COLOR_MAP
 from PIL import Image
 from sam2.build_sam import build_sam2
 from sam2.sam2_image_predictor import SAM2ImagePredictor
@@ -89,6 +91,7 @@ elif masks.ndim == 4:
 
 confidences = results[0]["scores"].cpu().numpy().tolist()
 class_names = results[0]["labels"]
+class_ids = np.array(list(range(len(class_names))))
 
 labels = [
     f"{class_name} {confidence:.2f}"
@@ -102,13 +105,21 @@ Visualize image with supervision useful API
 img = cv2.imread(img_path)
 detections = sv.Detections(
     xyxy=input_boxes,  # (n, 4)
-    mask=masks,  # (n, h, w)
+    mask=masks.astype(bool),  # (n, h, w)
+    class_id=class_ids
 )
 
-box_annotator = sv.BoxAnnotator()
-annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels)
+"""
+Note that if you want to use default color map,
+you can set color=ColorPalette.DEFAULT
+"""
+box_annotator = sv.BoxAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP))
+annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+label_annotator = sv.LabelAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP))
+annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
 cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame)
 
-mask_annotator = sv.MaskAnnotator()
+mask_annotator = sv.MaskAnnotator(color=ColorPalette.from_hex(CUSTOM_COLOR_MAP))
 annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
 cv2.imwrite("grounded_sam2_annotated_image_with_mask.jpg", annotated_frame)
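This demo (the HuggingFace-based one, judging by the `results[0]["scores"]` post-processing) additionally switches every annotator to a custom color palette built from `CUSTOM_COLOR_MAP`, the list added in `supervision_utils.py` below. A short sketch of the palette options, using a shortened hex list; as the added docstring notes, `ColorPalette.DEFAULT` keeps the library's default colors:

```python
import supervision as sv
from supervision.draw.color import ColorPalette

# Shortened stand-in for the full 16-color CUSTOM_COLOR_MAP in supervision_utils.py.
CUSTOM_COLOR_MAP = ["#e6194b", "#3cb44b", "#ffe119"]

palette = ColorPalette.from_hex(CUSTOM_COLOR_MAP)
box_annotator = sv.BoxAnnotator(color=palette)
label_annotator = sv.LabelAnnotator(color=palette)
mask_annotator = sv.MaskAnnotator(color=palette)

# Omitting `color`, or passing ColorPalette.DEFAULT, keeps supervision's defaults.
default_box_annotator = sv.BoxAnnotator(color=ColorPalette.DEFAULT)
```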
@@ -64,8 +64,6 @@ masks, scores, logits = sam2_predictor.predict(
     multimask_output=False,
 )
 
-import pdb; pdb.set_trace()
-
 """
 Post-process the output of the model to get the masks, scores, and logits for visualization
 """
@@ -81,6 +79,8 @@ elif masks.ndim == 4:
 confidences = confidences.numpy().tolist()
 class_names = labels
 
+class_ids = np.array(list(range(len(class_names))))
+
 labels = [
     f"{class_name} {confidence:.2f}"
     for class_name, confidence
@@ -93,11 +93,15 @@ Visualize image with supervision useful API
 img = cv2.imread(img_path)
 detections = sv.Detections(
     xyxy=input_boxes,  # (n, 4)
-    mask=masks,  # (n, h, w)
+    mask=masks.astype(bool),  # (n, h, w)
+    class_id=class_ids
 )
 
 box_annotator = sv.BoxAnnotator()
-annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=labels)
+annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+label_annotator = sv.LabelAnnotator()
+annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
 cv2.imwrite("groundingdino_annotated_image.jpg", annotated_frame)
 
 mask_annotator = sv.MaskAnnotator()
@@ -154,7 +154,9 @@ for frame_idx, segments in video_segments.items():
         class_id=np.array(object_ids, dtype=np.int32),
     )
     box_annotator = sv.BoxAnnotator()
-    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
     mask_annotator = sv.MaskAnnotator()
     annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
     cv2.imwrite(os.path.join(save_dir, f"annotated_frame_{frame_idx:05d}.jpg"), annotated_frame)
@@ -13,8 +13,7 @@ import numpy as np
 import supervision as sv
 from PIL import Image
 from sam2.build_sam import build_sam2_video_predictor, build_sam2
 from sam2.sam2_image_predictor import SAM2ImagePredictor
-from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 from track_utils import sample_points_from_masks
 from video_utils import create_video_from_images
 
@@ -177,7 +176,9 @@ for frame_idx, segments in video_segments.items():
         class_id=np.array(object_ids, dtype=np.int32),
     )
     box_annotator = sv.BoxAnnotator()
-    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
     mask_annotator = sv.MaskAnnotator()
     annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
     cv2.imwrite(os.path.join(save_dir, f"annotated_frame_{frame_idx:05d}.jpg"), annotated_frame)
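Both tracking demos get the same split: `BoxAnnotator` now draws only the boxes, and the `ID_TO_OBJECTS[i]` label strings move to `LabelAnnotator`. A rough per-frame sketch of that pattern; the tracker outputs (`ID_TO_OBJECTS`, `object_ids`, the masks, and the frame) are made up here, and `sv.mask_to_xyxy` is used just to derive boxes from the placeholder masks:

```python
import numpy as np
import supervision as sv

# Made-up stand-ins for per-frame tracker outputs.
ID_TO_OBJECTS = {1: "person", 2: "ball"}
object_ids = [1, 2]
masks = np.zeros((2, 120, 160), dtype=bool)      # (n, h, w) tracked masks
masks[0, 20:60, 30:90] = True
masks[1, 50:100, 80:150] = True
img = np.zeros((120, 160, 3), dtype=np.uint8)    # one video frame

detections = sv.Detections(
    xyxy=sv.mask_to_xyxy(masks),                 # (n, 4) boxes derived from the masks
    mask=masks,
    class_id=np.array(object_ids, dtype=np.int32),
)
annotated = sv.BoxAnnotator().annotate(scene=img.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(
    annotated, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids]
)
annotated = sv.MaskAnnotator().annotate(scene=annotated, detections=detections)
```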
supervision_utils.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+CUSTOM_COLOR_MAP = [
+    "#e6194b",
+    "#3cb44b",
+    "#ffe119",
+    "#0082c8",
+    "#f58231",
+    "#911eb4",
+    "#46f0f0",
+    "#f032e6",
+    "#d2f53c",
+    "#fabebe",
+    "#008080",
+    "#e6beff",
+    "#aa6e28",
+    "#fffac8",
+    "#800000",
+    "#aaffc3",
+]
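`CUSTOM_COLOR_MAP` is just a list of 16 hex strings. A quick usage sketch (not part of the commit), assuming `ColorPalette.by_idx`, which picks a color by index and wraps modulo the palette length, so any `class_id` maps to some entry:

```python
from supervision.draw.color import ColorPalette
from supervision_utils import CUSTOM_COLOR_MAP

palette = ColorPalette.from_hex(CUSTOM_COLOR_MAP)
print(len(palette.colors))   # 16 parsed Color entries
print(palette.by_idx(3))     # color used for class_id 3
print(palette.by_idx(20))    # wraps past the end of the list (20 % 16 == 4)
```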