import os
import cv2
import torch
import argparse
import numpy as np
import supervision as sv
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from transformers import AutoProcessor, AutoModelForCausalLM
from utils.supervision_utils import CUSTOM_COLOR_MAP
"""
Define some hyperparameters
"""
# Florence-2 task tokens keyed by a readable task name.
# Florence-2 selects its behaviour from the literal token at the start of the
# prompt, so every value must be the exact angle-bracket token (an empty
# string would leave the model without a task).
TASK_PROMPT = {
    "caption": "<CAPTION>",
    "detailed_caption": "<DETAILED_CAPTION>",
    "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
    "dense_region_caption": "<DENSE_REGION_CAPTION>",
    "region_proposal": "<REGION_PROPOSAL>",
    "phrase_grounding": "<CAPTION_TO_PHRASE_GROUNDING>",
    "referring_expression_segmentation": "<REFERRING_EXPRESSION_SEGMENTATION>",
    "region_to_segmentation": "<REGION_TO_SEGMENTATION>",
    "open_vocabulary_detection": "<OPEN_VOCABULARY_DETECTION>",
    "region_to_category": "<REGION_TO_CATEGORY>",
    "region_to_description": "<REGION_TO_DESCRIPTION>",
    "ocr": "<OCR>",
    "ocr_with_region": "<OCR_WITH_REGION>",
}

# Default directory for annotated images; created at import time.
# exist_ok=True already tolerates an existing directory, so no prior
# os.path.exists() check is needed.
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
"""
Initialize the Florence-2 and SAM 2 models
"""
FLORENCE2_MODEL_ID = "microsoft/Florence-2-large"
SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
SAM2_CONFIG = "sam2_hiera_l.yaml"

# Environment settings.
# Pick the device first so the CUDA-only tuning below can be skipped on
# CPU-only machines instead of failing before the CPU fallback is reached.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): torch_dtype is not referenced anywhere else in the visible
# code (the model below is loaded with torch_dtype='auto'); it is kept so the
# module still exposes the same names.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

if torch.cuda.is_available():
    # Use bfloat16 autocast. The context is entered here and never exited in
    # this file, so it stays active for everything that runs afterwards.
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

# build florence-2
florence2_model = AutoModelForCausalLM.from_pretrained(
    FLORENCE2_MODEL_ID,
    trust_remote_code=True,
    torch_dtype='auto',
).eval().to(device)
florence2_processor = AutoProcessor.from_pretrained(FLORENCE2_MODEL_ID, trust_remote_code=True)

# build sam 2
sam2_model = build_sam2(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)
sam2_predictor = SAM2ImagePredictor(sam2_model)
def run_florence2(task_prompt, text_input, model, processor, image):
    """Run a single Florence-2 task on one image and return the parsed answer.

    Args:
        task_prompt: Florence-2 task token (for example ``"<CAPTION>"``). It
            starts the prompt and is also passed as ``task`` to the
            processor's post-processing step.
        text_input: Extra text appended directly after the task token, or
            ``None`` when the task takes no text.
        model: Loaded Florence-2 model. Must expose ``device``, ``dtype`` and
            ``generate``.
        processor: Processor matching ``model``. Must be callable and expose
            ``batch_decode`` and ``post_process_generation``.
        image: Image object exposing ``width`` and ``height`` (a PIL image in
            this file).

    Returns:
        The value returned by ``processor.post_process_generation``. Callers
        in this file index it with ``task_prompt``.

    Raises:
        AssertionError: If ``model`` or ``processor`` is ``None``.
    """
    assert model is not None, "You should pass the init florence-2 model here"
    assert processor is not None, "You should set florence-2 processor here"

    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Cast to the model's own dtype rather than a hard-coded float16, so the
    # pixel values match the weights whatever precision the model was loaded in.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, model.dtype)

    # The tensors were already moved by the .to() call above, so they are
    # passed through without a second transfer.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text; it is handed unchanged to
    # the processor's post-processing step.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
"""
We try to support a series of cascaded auto-labelling pipelines with Florence-2 and SAM 2
"""
"""
Auto-Labelling Pipeline: Caption/Detailed Caption/More Detailed Caption + Phrase Grounding + Segmentation
"""
def caption_phrase_grounding_and_segmentation(
    florence2_model,
    florence2_processor,
    sam2_predictor,
    image_path,
    caption_task_prompt='<CAPTION>',
    output_dir=OUTPUT_DIR
):
    """Caption an image, ground the caption's phrases, and segment each box.

    Steps: Florence-2 caption -> Florence-2 phrase grounding on that caption
    -> SAM 2 masks prompted by the grounded boxes -> two annotated JPEGs
    written to ``output_dir``.

    Args:
        florence2_model: Florence-2 model, forwarded to ``run_florence2``.
        florence2_processor: Florence-2 processor, forwarded to
            ``run_florence2``.
        sam2_predictor: Predictor exposing ``set_image`` and ``predict``
            (a ``SAM2ImagePredictor`` in this file).
        image_path: Path of the image; it is read by both PIL and OpenCV.
        caption_task_prompt: ``"<CAPTION>"``, ``"<DETAILED_CAPTION>"`` or
            ``"<MORE_DETAILED_CAPTION>"``. An empty value falls back to
            ``"<CAPTION>"`` so callers relying on the old empty default
            keep working.
        output_dir: Directory for the annotated images; created if missing.

    Returns:
        None. Writes ``grounded_sam2_florence2_auto_labelling.jpg`` (boxes and
        labels) and ``grounded_sam2_florence2_auto_labelling_with_mask.jpg``
        (the same frame with masks added) unless nothing was grounded.

    Raises:
        AssertionError: If ``caption_task_prompt`` is not a caption task.
    """
    caption_tasks = ("<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>")
    grounding_task = "<CAPTION_TO_PHRASE_GROUNDING>"

    if not caption_task_prompt:
        caption_task_prompt = caption_tasks[0]
    assert caption_task_prompt in caption_tasks, (
        f"caption_task_prompt must be one of {caption_tasks}, got {caption_task_prompt!r}"
    )

    image = Image.open(image_path).convert("RGB")

    # image caption
    caption_results = run_florence2(caption_task_prompt, None, florence2_model, florence2_processor, image)
    text_input = caption_results[caption_task_prompt]
    print(f'Image caption for "{image_path}": ', text_input)

    # phrase grounding: the caption text becomes the text input of the task
    grounding_results = run_florence2(grounding_task, text_input, florence2_model, florence2_processor, image)
    grounding_results = grounding_results[grounding_task]

    # parse florence-2 detection results
    input_boxes = np.array(grounding_results["bboxes"])
    class_names = grounding_results["labels"]
    class_ids = np.arange(len(class_names))

    # Without any grounded box there is nothing to prompt SAM 2 with.
    if len(input_boxes) == 0:
        print(f'No phrases were grounded for "{image_path}"; nothing to segment.')
        return

    # predict mask with SAM 2 (scores and logits are not needed here)
    sam2_predictor.set_image(np.array(image))
    masks, _, _ = sam2_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_boxes,
        multimask_output=False,
    )
    # Drop the singleton axis when a 4-D array comes back, so masks are 3-D.
    # presumably (num_boxes, 1, H, W) for several boxes -- TODO confirm
    if masks.ndim == 4:
        masks = masks.squeeze(1)

    # specify labels
    labels = [str(class_name) for class_name in class_names]

    # visualization results
    img = cv2.imread(image_path)
    detections = sv.Detections(
        xyxy=input_boxes,
        mask=masks.astype(bool),
        class_id=class_ids
    )

    # A caller-supplied output_dir may not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # First image: boxes and labels only.
    annotated_frame = sv.BoxAnnotator().annotate(scene=img.copy(), detections=detections)
    annotated_frame = sv.LabelAnnotator().annotate(scene=annotated_frame, detections=detections, labels=labels)
    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_auto_labelling.jpg"), annotated_frame)

    # Second image: masks drawn on top of the already annotated frame.
    annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)
    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_auto_labelling_with_mask.jpg"), annotated_frame)

    print(f'Successfully save annotated image to "{output_dir}"')
if __name__ == "__main__":
    # Maps the --caption_type choice to the Florence-2 caption task token.
    CAPTION_TO_TASK_PROMPT = {
        "caption": "<CAPTION>",
        "detailed_caption": "<DETAILED_CAPTION>",
        "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
    }

    parser = argparse.ArgumentParser("Grounded SAM 2 Florence-2 Demos", add_help=True)
    # These options carry defaults, so they must not also be required:
    # argparse never applies a default to a required option.
    parser.add_argument("--image_path", type=str, default="./notebooks/images/cars.jpg", required=False, help="path to image file")
    parser.add_argument("--pipeline", type=str, default="caption_to_phrase_grounding", required=False, help="pipeline to use")
    # choices lets argparse reject an unknown caption type with a usage message.
    parser.add_argument(
        "--caption_type",
        type=str,
        default="caption",
        choices=list(CAPTION_TO_TASK_PROMPT),
        required=False,
        help="granularity of caption",
    )
    args = parser.parse_args()

    IMAGE_PATH = args.image_path
    PIPELINE = args.pipeline
    CAPTION_TYPE = args.caption_type

    print(f"Running pipeline: {PIPELINE} now.")

    if PIPELINE == "caption_to_phrase_grounding":
        # pipeline-1: caption + phrase grounding + segmentation
        caption_phrase_grounding_and_segmentation(
            florence2_model=florence2_model,
            florence2_processor=florence2_processor,
            sam2_predictor=sam2_predictor,
            caption_task_prompt=CAPTION_TO_TASK_PROMPT[CAPTION_TYPE],
            image_path=IMAGE_PATH
        )
    else:
        raise NotImplementedError(f"Pipeline: {args.pipeline} is not implemented at this time")