support gsam2 image predictor model

2024-08-01 17:05:01 +08:00
parent 72501fecf8
commit 1dacb47840
333 changed files with 24764 additions and 0 deletions
--- a/sav_dataset/utils/sav_benchmark.py
+++ b/sav_dataset/utils/sav_benchmark.py
@@ -0,0 +1,488 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the sav_dataset directory of this source tree.
+
+# adapted from https://github.com/hkchengrex/vos-benchmark
+# and  https://github.com/davisvideochallenge/davis2017-evaluation
+# with their licenses found in the LICENSE_VOS_BENCHMARK and LICENSE_DAVIS files
+# in the sav_dataset directory.
+import math
+import os
+import time
+from collections import defaultdict
+from multiprocessing import Pool
+from os import path
+from typing import Dict, List, Tuple
+
+import cv2
+import numpy as np
+import tqdm
+from PIL import Image
+from skimage.morphology import disk
+
+
+class VideoEvaluator:
+    def __init__(self, gt_root, pred_root, skip_first_and_last=True) -> None:
+        """
+        gt_root: path to the folder storing the gt masks
+        pred_root: path to the folder storing the predicted masks
+        skip_first_and_last: whether we should skip the evaluation of the first and the last frame.
+                             True for SA-V val and test, same as in DAVIS semi-supervised evaluation.
+        """
+        self.gt_root = gt_root
+        self.pred_root = pred_root
+        self.skip_first_and_last = skip_first_and_last
+
+    def __call__(self, vid_name: str) -> Tuple[str, Dict[str, float], Dict[str, float]]:
+        """
+        vid_name: name of the video to evaluate
+        """
+
+        # scan the folder to find subfolders for evaluation and
+        # check if the folder structure is SA-V
+        to_evaluate, is_sav_format = self.scan_vid_folder(vid_name)
+
+        # evaluate each (gt_path, pred_path) pair
+        eval_results = []
+        for all_frames, obj_id, gt_path, pred_path in to_evaluate:
+            if self.skip_first_and_last:
+                # skip the first and the last frames
+                all_frames = all_frames[1:-1]
+
+            evaluator = Evaluator(name=vid_name, obj_id=obj_id)
+            for frame in all_frames:
+                gt_array, pred_array = self.get_gt_and_pred(
+                    gt_path, pred_path, frame, is_sav_format
+                )
+                evaluator.feed_frame(mask=pred_array, gt=gt_array)
+
+            iou, boundary_f = evaluator.conclude()
+            eval_results.append((obj_id, iou, boundary_f))
+
+        if is_sav_format:
+            iou_output, boundary_f_output = self.consolidate(eval_results)
+        else:
+            assert len(eval_results) == 1
+            iou_output = eval_results[0][1]
+            boundary_f_output = eval_results[0][2]
+
+        return vid_name, iou_output, boundary_f_output
+
+    def get_gt_and_pred(
+        self,
+        gt_path: str,
+        pred_path: str,
+        f_name: str,
+        is_sav_format: bool,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Get the ground-truth and predicted masks for a single frame.
+        """
+        gt_mask_path = path.join(gt_path, f_name)
+        pred_mask_path = path.join(pred_path, f_name)
+        assert os.path.exists(pred_mask_path), f"{pred_mask_path} not found"
+
+        gt_array = np.array(Image.open(gt_mask_path))
+        pred_array = np.array(Image.open(pred_mask_path))
+        assert (
+            gt_array.shape[-2:] == pred_array.shape[-2:]
+        ), f"shape mismatch: {gt_mask_path}, {pred_mask_path}"
+
+        if is_sav_format:
+            assert len(np.unique(gt_array)) <= 2, (
+                f"found more than 1 object in {gt_mask_path} "
+                "SA-V format assumes one object mask per png file."
+            )
+            assert len(np.unique(pred_array)) <= 2, (
+                f"found more than 1 object in {pred_mask_path} "
+                "SA-V format assumes one object mask per png file."
+            )
+            gt_array = gt_array > 0
+            pred_array = pred_array > 0
+
+        return gt_array, pred_array
+
+    def scan_vid_folder(self, vid_name) -> Tuple[List, bool]:
+        """
+        Scan the folder structure of the video and return a list of folders for evaluate.
+        """
+
+        vid_gt_path = path.join(self.gt_root, vid_name)
+        vid_pred_path = path.join(self.pred_root, vid_name)
+        all_files_and_dirs = sorted(os.listdir(vid_gt_path))
+        to_evaluate = []
+        if all(name.endswith(".png") for name in all_files_and_dirs):
+            # All files are png files, dataset structure similar to DAVIS
+            is_sav_format = False
+            frames = all_files_and_dirs
+            obj_dir = None
+            to_evaluate.append((frames, obj_dir, vid_gt_path, vid_pred_path))
+        else:
+            # SA-V dataset structure, going one layer down into each subdirectory
+            is_sav_format = True
+            for obj_dir in all_files_and_dirs:
+                obj_gt_path = path.join(vid_gt_path, obj_dir)
+                obj_pred_path = path.join(vid_pred_path, obj_dir)
+                frames = sorted(os.listdir(obj_gt_path))
+                to_evaluate.append((frames, obj_dir, obj_gt_path, obj_pred_path))
+        return to_evaluate, is_sav_format
+
+    def consolidate(
+        self, eval_results
+    ) -> Tuple[str, Dict[str, float], Dict[str, float]]:
+        """
+        Consolidate the results of all the objects from the video into one dictionary.
+        """
+        iou_output = {}
+        boundary_f_output = {}
+        for obj_id, iou, boundary_f in eval_results:
+            assert len(iou) == 1
+            key = list(iou.keys())[0]
+            iou_output[obj_id] = iou[key]
+            boundary_f_output[obj_id] = boundary_f[key]
+        return iou_output, boundary_f_output
+
+
+#################################################################################################################
+# Functions below are from https://github.com/hkchengrex/vos-benchmark with minor modifications
+# _seg2bmap from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/utils.py
+# get_iou and Evaluator from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/evaluator.py
+# benchmark from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/benchmark.py with slight mod
+#################################################################################################################
+
+
+def _seg2bmap(seg, width=None, height=None):
+    """
+    From a segmentation, compute a binary boundary map with 1 pixel wide
+    boundaries.  The boundary pixels are offset by 1/2 pixel towards the
+    origin from the actual segment boundary.
+    Arguments:
+        seg     : Segments labeled from 1..k.
+        width	  :	Width of desired bmap  <= seg.shape[1]
+        height  :	Height of desired bmap <= seg.shape[0]
+    Returns:
+        bmap (ndarray):	Binary boundary map.
+     David Martin <dmartin@eecs.berkeley.edu>
+     January 2003
+    """
+
+    seg = seg.astype(bool)
+    seg[seg > 0] = 1
+
+    assert np.atleast_3d(seg).shape[2] == 1
+
+    width = seg.shape[1] if width is None else width
+    height = seg.shape[0] if height is None else height
+
+    h, w = seg.shape[:2]
+
+    ar1 = float(width) / float(height)
+    ar2 = float(w) / float(h)
+
+    assert not (
+        width > w | height > h | abs(ar1 - ar2) > 0.01
+    ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+
+    e = np.zeros_like(seg)
+    s = np.zeros_like(seg)
+    se = np.zeros_like(seg)
+
+    e[:, :-1] = seg[:, 1:]
+    s[:-1, :] = seg[1:, :]
+    se[:-1, :-1] = seg[1:, 1:]
+
+    b = seg ^ e | seg ^ s | seg ^ se
+    b[-1, :] = seg[-1, :] ^ e[-1, :]
+    b[:, -1] = seg[:, -1] ^ s[:, -1]
+    b[-1, -1] = 0
+
+    if w == width and h == height:
+        bmap = b
+    else:
+        bmap = np.zeros((height, width))
+        for x in range(w):
+            for y in range(h):
+                if b[y, x]:
+                    j = 1 + math.floor((y - 1) + height / h)
+                    i = 1 + math.floor((x - 1) + width / h)
+                    bmap[j, i] = 1
+
+    return bmap
+
+
+def get_iou(intersection, pixel_sum):
+    # handle edge cases without resorting to epsilon
+    if intersection == pixel_sum:
+        # both mask and gt have zero pixels in them
+        assert intersection == 0
+        return 1
+
+    return intersection / (pixel_sum - intersection)
+
+
+class Evaluator:
+    def __init__(self, boundary=0.008, name=None, obj_id=None):
+        # boundary: used in computing boundary F-score
+        self.boundary = boundary
+        self.name = name
+        self.obj_id = obj_id
+        self.objects_in_gt = set()
+        self.objects_in_masks = set()
+
+        self.object_iou = defaultdict(list)
+        self.boundary_f = defaultdict(list)
+
+    def feed_frame(self, mask: np.ndarray, gt: np.ndarray):
+        """
+        Compute and accumulate metrics for a single frame (mask/gt pair)
+        """
+
+        # get all objects in the ground-truth
+        gt_objects = np.unique(gt)
+        gt_objects = gt_objects[gt_objects != 0].tolist()
+
+        # get all objects in the predicted mask
+        mask_objects = np.unique(mask)
+        mask_objects = mask_objects[mask_objects != 0].tolist()
+
+        self.objects_in_gt.update(set(gt_objects))
+        self.objects_in_masks.update(set(mask_objects))
+
+        all_objects = self.objects_in_gt.union(self.objects_in_masks)
+
+        # boundary disk for boundary F-score. It is the same for all objects.
+        bound_pix = np.ceil(self.boundary * np.linalg.norm(mask.shape))
+        boundary_disk = disk(bound_pix)
+
+        for obj_idx in all_objects:
+            obj_mask = mask == obj_idx
+            obj_gt = gt == obj_idx
+
+            # object iou
+            self.object_iou[obj_idx].append(
+                get_iou((obj_mask * obj_gt).sum(), obj_mask.sum() + obj_gt.sum())
+            )
+            """
+            # boundary f-score
+            This part is copied from davis2017-evaluation
+            """
+            mask_boundary = _seg2bmap(obj_mask)
+            gt_boundary = _seg2bmap(obj_gt)
+            mask_dilated = cv2.dilate(mask_boundary.astype(np.uint8), boundary_disk)
+            gt_dilated = cv2.dilate(gt_boundary.astype(np.uint8), boundary_disk)
+
+            # Get the intersection
+            gt_match = gt_boundary * mask_dilated
+            fg_match = mask_boundary * gt_dilated
+
+            # Area of the intersection
+            n_fg = np.sum(mask_boundary)
+            n_gt = np.sum(gt_boundary)
+
+            # Compute precision and recall
+            if n_fg == 0 and n_gt > 0:
+                precision = 1
+                recall = 0
+            elif n_fg > 0 and n_gt == 0:
+                precision = 0
+                recall = 1
+            elif n_fg == 0 and n_gt == 0:
+                precision = 1
+                recall = 1
+            else:
+                precision = np.sum(fg_match) / float(n_fg)
+                recall = np.sum(gt_match) / float(n_gt)
+
+            # Compute F measure
+            if precision + recall == 0:
+                F = 0
+            else:
+                F = 2 * precision * recall / (precision + recall)
+            self.boundary_f[obj_idx].append(F)
+
+    def conclude(self):
+        all_iou = {}
+        all_boundary_f = {}
+
+        for object_id in self.objects_in_gt:
+            all_iou[object_id] = np.mean(self.object_iou[object_id]) * 100
+            all_boundary_f[object_id] = np.mean(self.boundary_f[object_id]) * 100
+
+        return all_iou, all_boundary_f
+
+
+def benchmark(
+    gt_roots,
+    mask_roots,
+    strict=True,
+    num_processes=None,
+    *,
+    verbose=True,
+    skip_first_and_last=True,
+):
+    """
+    gt_roots: a list of paths to datasets, i.e., [path_to_DatasetA, path_to_DatasetB, ...]
+    mask_roots: same as above, but the .png are masks predicted by the model
+    strict: when True, all videos in the dataset must have corresponding predictions.
+            Setting it to False is useful in cases where the ground-truth contains both train/val
+                sets, but the model only predicts the val subset.
+            Either way, if a video is predicted (i.e., the corresponding folder exists),
+                then it must at least contain all the masks in the ground truth annotations.
+                Masks that are in the prediction but not in the ground-truth
+                (i.e., sparse annotations) are ignored.
+    skip_first_and_last: whether we should skip the first and the last frame in evaluation.
+                            This is used by DAVIS 2017 in their semi-supervised evaluation.
+                            It should be disabled for unsupervised evaluation.
+    """
+
+    assert len(gt_roots) == len(mask_roots)
+    single_dataset = len(gt_roots) == 1
+
+    if verbose:
+        if skip_first_and_last:
+            print(
+                "We are *SKIPPING* the evaluation of the first and the last frame (standard for semi-supervised video object segmentation)."
+            )
+        else:
+            print(
+                "We are *NOT SKIPPING* the evaluation of the first and the last frame (*NOT STANDARD* for semi-supervised video object segmentation)."
+            )
+
+    pool = Pool(num_processes)
+    start = time.time()
+    to_wait = []
+    for gt_root, mask_root in zip(gt_roots, mask_roots):
+        # Validate folders
+        validated = True
+        gt_videos = os.listdir(gt_root)
+        mask_videos = os.listdir(mask_root)
+
+        # if the user passed the root directory instead of Annotations
+        if len(gt_videos) != len(mask_videos):
+            if "Annotations" in gt_videos:
+                if ".png" not in os.listdir(path.join(gt_root, "Annotations"))[0]:
+                    gt_root = path.join(gt_root, "Annotations")
+                    gt_videos = os.listdir(gt_root)
+
+        # remove non-folder items
+        gt_videos = list(filter(lambda x: path.isdir(path.join(gt_root, x)), gt_videos))
+        mask_videos = list(
+            filter(lambda x: path.isdir(path.join(mask_root, x)), mask_videos)
+        )
+
+        if not strict:
+            videos = sorted(list(set(gt_videos) & set(mask_videos)))
+        else:
+            gt_extras = set(gt_videos) - set(mask_videos)
+            mask_extras = set(mask_videos) - set(gt_videos)
+
+            if len(gt_extras) > 0:
+                print(
+                    f"Videos that are in {gt_root} but not in {mask_root}: {gt_extras}"
+                )
+                validated = False
+            if len(mask_extras) > 0:
+                print(
+                    f"Videos that are in {mask_root} but not in {gt_root}: {mask_extras}"
+                )
+                validated = False
+            if not validated:
+                print("Validation failed. Exiting.")
+                exit(1)
+
+            videos = sorted(gt_videos)
+
+        if verbose:
+            print(
+                f"In dataset {gt_root}, we are evaluating on {len(videos)} videos: {videos}"
+            )
+
+        if single_dataset:
+            if verbose:
+                results = tqdm.tqdm(
+                    pool.imap(
+                        VideoEvaluator(
+                            gt_root, mask_root, skip_first_and_last=skip_first_and_last
+                        ),
+                        videos,
+                    ),
+                    total=len(videos),
+                )
+            else:
+                results = pool.map(
+                    VideoEvaluator(
+                        gt_root, mask_root, skip_first_and_last=skip_first_and_last
+                    ),
+                    videos,
+                )
+        else:
+            to_wait.append(
+                pool.map_async(
+                    VideoEvaluator(
+                        gt_root, mask_root, skip_first_and_last=skip_first_and_last
+                    ),
+                    videos,
+                )
+            )
+
+    pool.close()
+
+    all_global_jf, all_global_j, all_global_f = [], [], []
+    all_object_metrics = []
+    for i, mask_root in enumerate(mask_roots):
+        if not single_dataset:
+            results = to_wait[i].get()
+
+        all_iou = []
+        all_boundary_f = []
+        object_metrics = {}
+        for name, iou, boundary_f in results:
+            all_iou.extend(list(iou.values()))
+            all_boundary_f.extend(list(boundary_f.values()))
+            object_metrics[name] = (iou, boundary_f)
+
+        global_j = np.array(all_iou).mean()
+        global_f = np.array(all_boundary_f).mean()
+        global_jf = (global_j + global_f) / 2
+
+        time_taken = time.time() - start
+        """
+        Build string for reporting results
+        """
+        # find max length for padding
+        ml = max(*[len(n) for n in object_metrics.keys()], len("Global score"))
+        # build header
+        out_string = f'{"sequence":<{ml}},{"obj":>3}, {"J&F":>4}, {"J":>4}, {"F":>4}\n'
+        out_string += f'{"Global score":<{ml}},{"":>3}, {global_jf:.1f}, {global_j:.1f}, {global_f:.1f}\n'
+        # append one line for each object
+        for name, (iou, boundary_f) in object_metrics.items():
+            for object_idx in iou.keys():
+                j, f = iou[object_idx], boundary_f[object_idx]
+                jf = (j + f) / 2
+                out_string += (
+                    f"{name:<{ml}},{object_idx:03}, {jf:>4.1f}, {j:>4.1f}, {f:>4.1f}\n"
+                )
+
+        # print to console
+        if verbose:
+            print(out_string.replace(",", " "), end="")
+            print("\nSummary:")
+            print(
+                f"Global score: J&F: {global_jf:.1f} J: {global_j:.1f} F: {global_f:.1f}"
+            )
+            print(f"Time taken: {time_taken:.2f}s")
+
+        # print to file
+        result_path = path.join(mask_root, "results.csv")
+        print(f"Saving the results to {result_path}")
+        with open(result_path, "w") as f:
+            f.write(out_string)
+
+        all_global_jf.append(global_jf)
+        all_global_j.append(global_j)
+        all_global_f.append(global_f)
+        all_object_metrics.append(object_metrics)
+
+    return all_global_jf, all_global_j, all_global_f, all_object_metrics