diff --git a/README.md b/README.md
index ec7f81d..09bdb76 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Grounded SAM 2 does not introduce significant methodological changes compared to
   - [Grounded SAM 2 Video Object Tracking with Custom Video Input (using Grounding DINO 1.5 & 1.6)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-grounding-dino-15--16)
   - [Grounded SAM 2 Video Object Tracking with Continuous ID (using Grounding DINO)](#grounded-sam-2-video-object-tracking-with-continuous-id-with-grounding-dino)
 - [Grounded SAM 2 Florence-2 Demos](#grounded-sam-2-florence-2-demos)
-  - [Grounded SAM 2 Florence-2 Image Demo](#grounded-sam-2-florence-2-image-demo)
+  - [Grounded SAM 2 Florence-2 Image Demo (Updating)](#grounded-sam-2-florence-2-image-demo-updating)
 - [Citation](#citation)
@@ -236,28 +236,48 @@ In this section, we will explore how to integrate the feature-rich and robust op
 | Dense Region Caption | `<DENSE_REGION_CAPTION>` | ✘ | Detect main objects with short description |
 | Region Proposal | `<REGION_PROPOSAL>` | ✘ | Generate proposals without category name |
 | Phrase Grounding | `<CAPTION_TO_PHRASE_GROUNDING>` | ✔ | Ground main objects in image mentioned in caption |
+| Referring Expression Segmentation | `<REFERRING_EXPRESSION_SEGMENTATION>` | ✔ | Ground the object most related to the text input |
 
 By integrating `Florence-2` with `SAM-2`, we can build a strong vision pipeline to solve complex vision tasks. You can try the following scripts to run the demos:
 
 **Object Detection and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline object_detection_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline object_detection_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Dense Region Caption and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline dense_region_caption_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline dense_region_caption_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Region Proposal and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline region_proposal_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline region_proposal_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Phrase Grounding and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline phrase_grounding_and_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline phrase_grounding_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The image shows two vintage Chevrolet cars parked side by side, with one being a red convertible and the other a pink sedan, \
+    set against the backdrop of an urban area with a multi-story building and trees. \
+    The cars have Cuban license plates, indicating a location likely in Cuba."
+```
+
+**Referring Expression Segmentation**
+```bash
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline referring_expression_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The left red car."
+```
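Before the script changes below, it is worth noting the pattern every pipeline in this demo shares: Florence-2 turns a task prompt (plus optional text input) into region outputs, and SAM 2 refines those regions into masks. A minimal sketch of that handoff, assuming `florence2_model`, `florence2_processor`, and `sam2_predictor` are already built as in the script's `__main__` block, using the script's own `run_florence2` helper (the `<OD>` task token and its `bboxes` output field follow Florence-2's documented format):

```python
import numpy as np
from PIL import Image

# assumes florence2_model, florence2_processor, and sam2_predictor are already
# loaded as in the demo script's __main__ block
image = Image.open("./notebooks/images/cars.jpg").convert("RGB")

# stage 1: Florence-2 maps a task prompt to regions (boxes here; the referring
# expression task returns polygons instead)
results = run_florence2("<OD>", None, florence2_model, florence2_processor, image)
input_boxes = np.array(results["<OD>"]["bboxes"])  # (N, 4) xyxy boxes

# stage 2: SAM 2 refines the box prompts into segmentation masks
sam2_predictor.set_image(np.array(image))
masks, scores, logits = sam2_predictor.predict(
    point_coords=None,
    point_labels=None,
    box=input_boxes,
    multimask_output=False,
)
```

The new referring expression pipeline in the patch below follows the same two-stage pattern; it only differs in how the Florence-2 output is decoded into a box prompt.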
diff --git a/grounded_sam2_image_demo_florence2.py b/grounded_sam2_image_demo_florence2.py
index c5ed2c5..4a19548 100644
--- a/grounded_sam2_image_demo_florence2.py
+++ b/grounded_sam2_image_demo_florence2.py
@@ -109,6 +109,7 @@ def object_detection_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the object detection pipeline."
     # run florence-2 object detection on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -184,6 +185,7 @@ def dense_region_caption_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the dense region caption pipeline."
     # run florence-2 dense region caption on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -260,6 +262,7 @@ def region_proposal_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the region proposal pipeline."
     # run florence-2 region proposal on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -398,15 +401,129 @@ def phrase_grounding_and_segmentation(
     print(f'Successfully save annotated image to "{output_dir}"')
 
 
+"""
+Pipeline 5: Referring Expression Segmentation
+
+Note that Florence-2 directly supports referring expression segmentation, but its polygon output format may not be very accurate,
+so we decode a bounding box from the polygon and use SAM 2 for mask prediction instead.
+"""
+def referring_expression_segmentation(
+    florence2_model,
+    florence2_processor,
+    sam2_predictor,
+    image_path,
+    task_prompt="<REFERRING_EXPRESSION_SEGMENTATION>",
+    text_input=None,
+    output_dir=OUTPUT_DIR
+):
+    assert text_input is not None, "Text input should not be None when calling the referring expression segmentation pipeline."
+    # run florence-2 referring expression segmentation on the demo image
+    image = Image.open(image_path).convert("RGB")
+    results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
+
+    """ Florence-2 Referring Expression Segmentation Output Format
+    {'<REFERRING_EXPRESSION_SEGMENTATION>':
+        {
+            'polygons': [[[...]]],
+            'labels': ['']
+        }
+    }
+    """
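+    # Florence-2 returns the referred region as polygon vertices. Below we
+    # rasterize that polygon into a mask for visualization and also derive its
+    # tight bounding box, which becomes the box prompt for SAM 2.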
+    results = results[task_prompt]
+    # parse florence-2 referring segmentation results
+    polygon_points = np.array(results["polygons"][0], dtype=np.int32).reshape(-1, 2)
+    class_names = [text_input]
+    class_ids = np.array(list(range(len(class_names))))
+
+    # rasterize the polygon into a binary mask
+    img_width, img_height = image.size[0], image.size[1]
+    florence2_mask = np.zeros((img_height, img_width), dtype=np.uint8)
+    if len(polygon_points) < 3:
+        print("Invalid polygon:", polygon_points)
+        exit()
+    cv2.fillPoly(florence2_mask, [polygon_points], 1)
+    if florence2_mask.ndim == 2:
+        florence2_mask = florence2_mask[None]
+
+    # compute the bounding box from the polygon points
+    x_min = np.min(polygon_points[:, 0])
+    y_min = np.min(polygon_points[:, 1])
+    x_max = np.max(polygon_points[:, 0])
+    y_max = np.max(polygon_points[:, 1])
+
+    input_boxes = np.array([[x_min, y_min, x_max, y_max]])
+
+    # predict the mask with SAM 2, prompted by the decoded box
+    sam2_predictor.set_image(np.array(image))
+    sam2_masks, scores, logits = sam2_predictor.predict(
+        point_coords=None,
+        point_labels=None,
+        box=input_boxes,
+        multimask_output=False,
+    )
+
+    if sam2_masks.ndim == 4:
+        sam2_masks = sam2_masks.squeeze(1)
+
+    # specify labels
+    labels = [
+        f"{class_name}" for class_name in class_names
+    ]
+
+    # visualize the florence-2 polygon mask
+    img = cv2.imread(image_path)
+    detections = sv.Detections(
+        xyxy=input_boxes,
+        mask=florence2_mask.astype(bool),
+        class_id=class_ids
+    )
+
+    box_annotator = sv.BoxAnnotator()
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+    cv2.imwrite(os.path.join(output_dir, "florence2_referring_segmentation_box.jpg"), annotated_frame)
+
+    mask_annotator = sv.MaskAnnotator()
+    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+    cv2.imwrite(os.path.join(output_dir, "florence2_referring_segmentation_box_with_mask.jpg"), annotated_frame)
+
+    print(f'Successfully saved Florence-2 annotated image to "{output_dir}"')
+
+    # visualize the sam2 mask
+    img = cv2.imread(image_path)
+    detections = sv.Detections(
+        xyxy=input_boxes,
+        mask=sam2_masks.astype(bool),
+        class_id=class_ids
+    )
+
+    box_annotator = sv.BoxAnnotator()
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_referring_box.jpg"), annotated_frame)
+
+    mask_annotator = sv.MaskAnnotator()
+    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_referring_box_with_sam2_mask.jpg"), annotated_frame)
+
+    print(f'Successfully saved SAM 2 annotated image to "{output_dir}"')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("Grounded SAM 2 Florence-2 Demos", add_help=True)
     parser.add_argument("--image_path", type=str, default="./notebooks/images/cars.jpg", required=True, help="path to image file")
-    parser.add_argument("--pipeline", type=str, default="object_detection_segmentation", required=True, help="path to image file")
+    parser.add_argument("--pipeline", type=str, default="object_detection_segmentation", required=True, help="name of the pipeline to run")
+    parser.add_argument("--text_input", type=str, default=None, required=False, help="text input for the phrase grounding and referring expression pipelines")
     args = parser.parse_args()
 
     IMAGE_PATH = args.image_path
     PIPELINE = args.pipeline
+    INPUT_TEXT = args.text_input
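+    # --text_input is only consumed by the phrase grounding and referring
+    # expression pipelines; the other three pipelines assert that it is None.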
print(f"Running pipeline: {PIPELINE} now.") @@ -441,9 +558,16 @@ if __name__ == "__main__": florence2_processor=florence2_processor, sam2_predictor=sam2_predictor, image_path=IMAGE_PATH, - text_input="The image shows two vintage Chevrolet cars parked side by side, with one being a red convertible and the other a pink sedan, \ - set against the backdrop of an urban area with a multi-story building and trees. \ - The cars have Cuban license plates, indicating a location likely in Cuba." + text_input=INPUT_TEXT + ) + elif PIPELINE == "referring_expression_segmentation": + # pipeline-5: referring segmentation + sam2 segmentation + referring_expression_segmentation( + florence2_model=florence2_model, + florence2_processor=florence2_processor, + sam2_predictor=sam2_predictor, + image_path=IMAGE_PATH, + text_input=INPUT_TEXT ) else: raise NotImplementedError(f"Pipeline: {args.pipeline} is not implemented at this time") \ No newline at end of file