add open-vocab demo

2024-08-19 00:22:47 +08:00
parent 5f886743d9
commit afa91ca407
2 changed files with 92 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -283,6 +283,13 @@ python grounded_sam2_image_demo_florence2.py \
    --text_input "The left red car."
 ```
 **Open-Vocabulary Detection and Segmentation**
 ```bash
 python grounded_sam2_image_demo_florence2.py \
    --pipeline open_vocabulary_detection_segmentation \
    --image_path ./notebooks/images/cars.jpg \
    --text_input "two cars"
 ```
 ### Citation
--- a/grounded_sam2_image_demo_florence2.py
+++ b/grounded_sam2_image_demo_florence2.py
@@ -513,6 +513,81 @@ def referring_expression_segmentation(
    print(f'Successfully save sam2 annotated image to "{output_dir}"')
 """
 Pipeline 6: Open-Vocabulary Detection + Segmentation
 """
 def open_vocabulary_detection_and_segmentation(
    florence2_model,
    florence2_processor,
    sam2_predictor,
    image_path,
    task_prompt="<OPEN_VOCABULARY_DETECTION>",
    text_input=None,
    output_dir=OUTPUT_DIR
 ):
    """Run Florence-2 open-vocabulary detection, then segment each box with SAM 2.

    The image at ``image_path`` is passed to ``run_florence2`` together with
    ``task_prompt`` and ``text_input``. The boxes found under
    ``results[task_prompt]["bboxes"]`` are used as box prompts for
    ``sam2_predictor``. Two annotated images are written to ``output_dir``:
    one with boxes and labels, and one with the masks drawn on top.

    Florence-2 open-vocabulary detection output format::

        {'<OPEN_VOCABULARY_DETECTION>':
            {
                'bboxes':
                    [
                        [34.23999786376953, 159.1199951171875, 582.0800170898438, 374.6399841308594]
                    ],
                'bboxes_labels': ['A green car'],
                'polygons': [],
                'polygons_labels': []
            }
        }

    Args:
        florence2_model: Model object forwarded to ``run_florence2``.
        florence2_processor: Processor object forwarded to ``run_florence2``.
        sam2_predictor: Predictor exposing ``set_image`` and ``predict``.
        image_path: Path of the image to process.
        task_prompt: Florence-2 task token; also the key used to read the result.
        text_input: Text describing what to detect. Must not be ``None``.
        output_dir: Directory the annotated images are written to.

    Raises:
        ValueError: If ``text_input`` is ``None``.
    """
    # Validate before loading the image or running the model, so a missing
    # prompt fails fast instead of after (or during) Florence-2 inference.
    if text_input is None:
        raise ValueError("Text input should not be none when calling open-vocabulary detection pipeline.")

    # run florence-2 open-vocabulary detection
    image = Image.open(image_path).convert("RGB")
    results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
    results = results[task_prompt]
    print(results)

    # parse florence-2 detection results
    input_boxes = np.array(results["bboxes"])
    class_names = results["bboxes_labels"]
    class_ids = np.arange(len(class_names))

    # With no boxes there is nothing to prompt SAM 2 with and nothing to draw;
    # an empty array would not have the (n, 4) layout the later steps are given.
    if input_boxes.size == 0:
        print(f'No objects detected for text input "{text_input}", skip segmentation.')
        return

    # predict mask with SAM 2
    sam2_predictor.set_image(np.array(image))
    masks, scores, logits = sam2_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_boxes,
        multimask_output=False,
    )

    # drop the singleton axis when the predictor returns a 4-D mask array
    if masks.ndim == 4:
        masks = masks.squeeze(1)

    # specify labels
    labels = [str(class_name) for class_name in class_names]

    # visualization results (re-read with OpenCV so the scene matches cv2.imwrite)
    img = cv2.imread(image_path)
    detections = sv.Detections(
        xyxy=input_boxes,
        mask=masks.astype(bool),
        class_id=class_ids
    )

    box_annotator = sv.BoxAnnotator()
    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)

    label_annotator = sv.LabelAnnotator()
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_open_vocabulary_detection.jpg"), annotated_frame)

    # masks are drawn on top of the already box/label-annotated frame
    mask_annotator = sv.MaskAnnotator()
    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_open_vocabulary_detection_with_mask.jpg"), annotated_frame)

    print(f'Successfully save annotated image to "{output_dir}"')
 if __name__ == "__main__":
    parser = argparse.ArgumentParser("Grounded SAM 2 Florence-2 Demos", add_help=True)
@@ -561,7 +636,7 @@ if __name__ == "__main__":
            text_input=INPUT_TEXT
        )
    elif PIPELINE == "referring_expression_segmentation":
-        # pipeline-5: referring segmentation + sam2 segmentation
+        # pipeline-5: referring segmentation + segmentation
        referring_expression_segmentation(
            florence2_model=florence2_model,
            florence2_processor=florence2_processor,
@@ -569,5 +644,14 @@ if __name__ == "__main__":
            image_path=IMAGE_PATH,
            text_input=INPUT_TEXT
        )
    elif PIPELINE == "open_vocabulary_detection_segmentation":
        # pipeline-6: open-vocabulary detection + segmentation
        open_vocabulary_detection_and_segmentation(
            florence2_model=florence2_model,
            florence2_processor=florence2_processor,
            sam2_predictor=sam2_predictor,
            image_path=IMAGE_PATH,
            text_input=INPUT_TEXT
        )
    else:
        raise NotImplementedError(f"Pipeline: {args.pipeline} is not implemented at this time")