diff --git a/README.md b/README.md
index ec7f81d..09bdb76 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Grounded SAM 2 does not introduce significant methodological changes compared to
   - [Grounded SAM 2 Video Object Tracking with Custom Video Input (using Grounding DINO 1.5 & 1.6)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-grounding-dino-15--16)
   - [Grounded SAM 2 Video Object Tracking with Continuous ID (using Grounding DINO)](#grounded-sam-2-video-object-tracking-with-continuous-id-with-grounding-dino)
 - [Grounded SAM 2 Florence-2 Demos](#grounded-sam-2-florence-2-demos)
-  - [Grounded SAM 2 Florence-2 Image Demo](#grounded-sam-2-florence-2-image-demo)
+  - [Grounded SAM 2 Florence-2 Image Demo (Updating)](#grounded-sam-2-florence-2-image-demo-updating)
 - [Citation](#citation)
@@ -236,28 +236,48 @@ In this section, we will explore how to integrate the feature-rich and robust op
 | Dense Region Caption | `<DENSE_REGION_CAPTION>` | ✘ | Detect main objects with short description |
 | Region Proposal | `<REGION_PROPOSAL>` | ✘ | Generate proposals without category name |
 | Phrase Grounding | `<CAPTION_TO_PHRASE_GROUNDING>` | ✔ | Ground main objects in image mentioned in caption |
+| Referring Expression Segmentation | `<REFERRING_EXPRESSION_SEGMENTATION>` | ✔ | Ground the object most related to the text input |
 
 By integrating `Florence-2` with `SAM-2`, we can build a strong vision pipeline to solve complex vision tasks. You can try the following scripts to run the demos:
 
 **Object Detection and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline object_detection_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline object_detection_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Dense Region Caption and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline dense_region_caption_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline dense_region_caption_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Region Proposal and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline region_proposal_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline region_proposal_segmentation \
+    --image_path ./notebooks/images/cars.jpg
 ```
 
 **Phrase Grounding and Segmentation**
 
 ```bash
-python grounded_sam2_image_demo_florence2.py --pipeline phrase_grounding_and_segmentation --image_path ./notebooks/images/cars.jpg
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline phrase_grounding_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The image shows two vintage Chevrolet cars parked side by side, with one being a red convertible and the other a pink sedan, \
+    set against the backdrop of an urban area with a multi-story building and trees. \
+    The cars have Cuban license plates, indicating a location likely in Cuba."
+```
+
+**Referring Expression Segmentation**
+```bash
+python grounded_sam2_image_demo_florence2.py \
+    --pipeline referring_expression_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The left red car."
+```
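Before the script changes below, it is worth noting the pattern every pipeline in this demo shares: Florence-2 turns a task prompt (plus optional text input) into region outputs, and SAM 2 refines those regions into masks. A minimal sketch of that handoff, assuming `florence2_model`, `florence2_processor`, and `sam2_predictor` are already built as in the script's `__main__` block, using the script's own `run_florence2` helper (the `<OD>` task token and its `bboxes` output field follow Florence-2's documented format):

```python
import numpy as np
from PIL import Image

# assumes florence2_model, florence2_processor, and sam2_predictor are already
# loaded as in the demo script's __main__ block
image = Image.open("./notebooks/images/cars.jpg").convert("RGB")

# stage 1: Florence-2 maps a task prompt to regions (boxes here; the referring
# expression task returns polygons instead)
results = run_florence2("<OD>", None, florence2_model, florence2_processor, image)
input_boxes = np.array(results["<OD>"]["bboxes"])  # (N, 4) xyxy boxes

# stage 2: SAM 2 refines the box prompts into segmentation masks
sam2_predictor.set_image(np.array(image))
masks, scores, logits = sam2_predictor.predict(
    point_coords=None,
    point_labels=None,
    box=input_boxes,
    multimask_output=False,
)
```

The new referring expression pipeline in the patch below follows the same two-stage pattern; it only differs in how the Florence-2 output is decoded into a box prompt.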
diff --git a/grounded_sam2_image_demo_florence2.py b/grounded_sam2_image_demo_florence2.py
index c5ed2c5..4a19548 100644
--- a/grounded_sam2_image_demo_florence2.py
+++ b/grounded_sam2_image_demo_florence2.py
@@ -109,6 +109,7 @@ def object_detection_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the object detection pipeline."
     # run florence-2 object detection on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -184,6 +185,7 @@ def dense_region_caption_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the dense region caption pipeline."
     # run florence-2 dense region caption on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -260,6 +262,7 @@ def region_proposal_and_segmentation(
     text_input=None,
     output_dir=OUTPUT_DIR
 ):
+    assert text_input is None, "Text input should be None when calling the region proposal pipeline."
     # run florence-2 region proposal on the demo image
     image = Image.open(image_path).convert("RGB")
     results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
@@ -398,15 +401,129 @@ def phrase_grounding_and_segmentation(
     print(f'Successfully save annotated image to "{output_dir}"')
 
 
+"""
+Pipeline 5: Referring Expression Segmentation
+
+Note that Florence-2 directly supports referring expression segmentation, but its polygon output format may not be very accurate,
+so we decode a bounding box from the polygon and use SAM 2 for mask prediction instead.
+"""
+def referring_expression_segmentation(
+    florence2_model,
+    florence2_processor,
+    sam2_predictor,
+    image_path,
+    task_prompt="<REFERRING_EXPRESSION_SEGMENTATION>",
+    text_input=None,
+    output_dir=OUTPUT_DIR
+):
+    assert text_input is not None, "Text input should not be None when calling the referring expression segmentation pipeline."
+    # run florence-2 referring expression segmentation on the demo image
+    image = Image.open(image_path).convert("RGB")
+    results = run_florence2(task_prompt, text_input, florence2_model, florence2_processor, image)
+
+    """ Florence-2 Referring Expression Segmentation Output Format
+    {'<REFERRING_EXPRESSION_SEGMENTATION>':
+        {
+            'polygons': [[[...]]],
+            'labels': ['']
+        }
+    }
+    """
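+    # Florence-2 returns the referred region as polygon vertices. Below we
+    # rasterize that polygon into a mask for visualization and also derive its
+    # tight bounding box, which becomes the box prompt for SAM 2.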
+    results = results[task_prompt]
+    # parse florence-2 referring segmentation results
+    polygon_points = np.array(results["polygons"][0], dtype=np.int32).reshape(-1, 2)
+    class_names = [text_input]
+    class_ids = np.array(list(range(len(class_names))))
+
+    # rasterize the polygon into a binary mask
+    img_width, img_height = image.size[0], image.size[1]
+    florence2_mask = np.zeros((img_height, img_width), dtype=np.uint8)
+    if len(polygon_points) < 3:
+        print("Invalid polygon:", polygon_points)
+        exit()
+    cv2.fillPoly(florence2_mask, [polygon_points], 1)
+    if florence2_mask.ndim == 2:
+        florence2_mask = florence2_mask[None]
+
+    # compute the bounding box from the polygon points
+    x_min = np.min(polygon_points[:, 0])
+    y_min = np.min(polygon_points[:, 1])
+    x_max = np.max(polygon_points[:, 0])
+    y_max = np.max(polygon_points[:, 1])
+
+    input_boxes = np.array([[x_min, y_min, x_max, y_max]])
+
+    # predict the mask with SAM 2, prompted by the decoded box
+    sam2_predictor.set_image(np.array(image))
+    sam2_masks, scores, logits = sam2_predictor.predict(
+        point_coords=None,
+        point_labels=None,
+        box=input_boxes,
+        multimask_output=False,
+    )
+
+    if sam2_masks.ndim == 4:
+        sam2_masks = sam2_masks.squeeze(1)
+
+    # specify labels
+    labels = [
+        f"{class_name}" for class_name in class_names
+    ]
+
+    # visualize the florence-2 polygon mask
+    img = cv2.imread(image_path)
+    detections = sv.Detections(
+        xyxy=input_boxes,
+        mask=florence2_mask.astype(bool),
+        class_id=class_ids
+    )
+
+    box_annotator = sv.BoxAnnotator()
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+    cv2.imwrite(os.path.join(output_dir, "florence2_referring_segmentation_box.jpg"), annotated_frame)
+
+    mask_annotator = sv.MaskAnnotator()
+    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+    cv2.imwrite(os.path.join(output_dir, "florence2_referring_segmentation_box_with_mask.jpg"), annotated_frame)
+
+    print(f'Successfully saved Florence-2 annotated image to "{output_dir}"')
+
+    # visualize the sam2 mask
+    img = cv2.imread(image_path)
+    detections = sv.Detections(
+        xyxy=input_boxes,
+        mask=sam2_masks.astype(bool),
+        class_id=class_ids
+    )
+
+    box_annotator = sv.BoxAnnotator()
+    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
+
+    label_annotator = sv.LabelAnnotator()
+    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_referring_box.jpg"), annotated_frame)
+
+    mask_annotator = sv.MaskAnnotator()
+    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+    cv2.imwrite(os.path.join(output_dir, "grounded_sam2_florence2_referring_box_with_sam2_mask.jpg"), annotated_frame)
+
+    print(f'Successfully saved SAM 2 annotated image to "{output_dir}"')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("Grounded SAM 2 Florence-2 Demos", add_help=True)
     parser.add_argument("--image_path", type=str, default="./notebooks/images/cars.jpg", required=True, help="path to image file")
-    parser.add_argument("--pipeline", type=str, default="object_detection_segmentation", required=True, help="path to image file")
+    parser.add_argument("--pipeline", type=str, default="object_detection_segmentation", required=True, help="name of the pipeline to run")
+    parser.add_argument("--text_input", type=str, default=None, required=False, help="text input for the phrase grounding and referring expression pipelines")
     args = parser.parse_args()
 
     IMAGE_PATH = args.image_path
     PIPELINE = args.pipeline
+    INPUT_TEXT = args.text_input
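+    # --text_input is only consumed by the phrase grounding and referring
+    # expression pipelines; the other three pipelines assert that it is None.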
print(f"Running pipeline: {PIPELINE} now.") @@ -441,9 +558,16 @@ if __name__ == "__main__": florence2_processor=florence2_processor, sam2_predictor=sam2_predictor, image_path=IMAGE_PATH, - text_input="The image shows two vintage Chevrolet cars parked side by side, with one being a red convertible and the other a pink sedan, \ - set against the backdrop of an urban area with a multi-story building and trees. \ - The cars have Cuban license plates, indicating a location likely in Cuba." + text_input=INPUT_TEXT + ) + elif PIPELINE == "referring_expression_segmentation": + # pipeline-5: referring segmentation + sam2 segmentation + referring_expression_segmentation( + florence2_model=florence2_model, + florence2_processor=florence2_processor, + sam2_predictor=sam2_predictor, + image_path=IMAGE_PATH, + text_input=INPUT_TEXT ) else: raise NotImplementedError(f"Pipeline: {args.pipeline} is not implemented at this time") \ No newline at end of file