support gsam2 image predictor model

2024-08-01 17:05:01 +08:00
parent 72501fecf8
commit 1dacb47840
333 changed files with 24764 additions and 0 deletions
--- a/sav_dataset/LICENSE
+++ b/sav_dataset/LICENSE
@@ -0,0 +1,30 @@
+BSD License
+
+For SAM 2 Eval software
+
+Copyright (c) Meta Platforms, Inc. and affiliates.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Meta nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/sav_dataset/LICENSE_DAVIS
+++ b/sav_dataset/LICENSE_DAVIS
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2020, DAVIS: Densely Annotated VIdeo Segmentation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/sav_dataset/LICENSE_VOS_BENCHMARK
+++ b/sav_dataset/LICENSE_VOS_BENCHMARK
@@ -0,0 +1,7 @@
+Copyright 2023 Rex Cheng
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/sav_dataset/README.md
+++ b/sav_dataset/README.md
@@ -0,0 +1,164 @@
+# Segment Anything Video (SA-V) Dataset
+
+## Overview
+
+[Segment Anything Video (SA-V)](https://ai.meta.com/datasets/segment-anything-video/) consists of 51K diverse videos and 643K high-quality spatio-temporal segmentation masks (i.e., masklets). The dataset is released under the CC BY 4.0 license. Browse the dataset [here](https://sam2.metademolab.com/dataset).
+
+![SA-V dataset](../assets/sa_v_dataset.jpg?raw=true)
+
+## Getting Started
+
+### Download the dataset
+
+Visit [here](https://ai.meta.com/datasets/segment-anything-video-downloads/) to download SA-V including the training, val and test sets.
+
+### Dataset Stats
+
+|            | Num Videos | Num Masklets                              |
+| ---------- | ---------- | ----------------------------------------- |
+| SA-V train | 50,583     | 642,036 (auto 451,720 and manual 190,316) |
+| SA-V val   | 155        | 293                                       |
+| SA-V test  | 150        | 278                                       |
+
+### Notebooks
+
+To load and visualize the SA-V training set annotations, refer to the example [sav_visualization_example.ipynb](./sav_visualization_example.ipynb) notebook.
+
+### SA-V train
+
+For the SA-V training set, we release the mp4 videos and store the masklet annotations per video as JSON files. Automatic masklets and manual masklets are stored separately as two JSON files: `{video_id}_auto.json` and `{video_id}_manual.json`. They can be loaded as dictionaries in Python in the format below.
+
+```
+{
+    "video_id"                        : str; video id
+    "video_duration"                  : float64; the duration in seconds of this video
+    "video_frame_count"               : float64; the number of frames in the video
+    "video_height"                    : float64; the height of the video
+    "video_width"                     : float64; the width of the video
+    "video_resolution"                : float64; video_height $\times$ video_width
+    "video_environment"               : List[str]; "Indoor" or "Outdoor"
+    "video_split"                     : str; "train" for training set
+    "masklet"                         : List[List[Dict]]; masklet annotations in list of list of RLEs.
+                                        The outer list is over frames in the video and the inner list
+                                        is over objects in the video.
+    "masklet_id"                      : List[int]; the masklet ids
+    "masklet_size_rel"                : List[float]; the average mask area normalized by resolution
+                                        across all the frames where the object is visible
+    "masklet_size_abs"                : List[float]; the average mask area (in pixels)
+                                        across all the frames where the object is visible
+    "masklet_size_bucket"             : List[str]; "small": $1$ <= masklet_size_abs < $32^2$,
+                                        "medium": $32^2$ <= masklet_size_abs < $96^2$,
+                                        and "large": masklet_size_abs >= $96^2$
+    "masklet_visibility_changes"      : List[int]; the number of times where the visibility changes
+                                        after the first appearance (e.g., invisible -> visible
+                                        or visible -> invisible)
+    "masklet_first_appeared_frame"    : List[int]; the index of the frame where the object appears
+                                        the first time in the video. Always 0 for auto masklets.
+    "masklet_frame_count"             : List[int]; the number of frames being annotated. Note that
+                                        videos are annotated at 6 fps (annotated every 4 frames)
+                                        while the videos are at 24 fps.
+    "masklet_edited_frame_count"      : List[int]; the number of frames being edited by human annotators.
+                                        Always 0 for auto masklets.
+    "masklet_type"                    : List[str]; "auto" or "manual"
+    "masklet_stability_score"         : Optional[List[List[float]]]; per-mask stability scores. Auto annotation only.
+    "masklet_num"                     : int; the number of manual/auto masklets in the video
+
+}
+```
+
+Note that in SA-V train, there are in total 50,583 videos where all of them have manual annotations. Among the 50,583 videos there are 48,436 videos that also have automatic annotations.
+
+### SA-V val and test
+
+For SA-V val and test sets, we release the extracted frames as jpeg files, and the masks as png files with the following directory structure:
+
+```
+sav_val(sav_test)
+├── sav_val.txt (sav_test.txt): a list of video ids in the split
+├── JPEGImages_24fps # videos are extracted at 24 fps
+│   ├── {video_id}
+│   │     ├── 00000.jpg        # video frame
+│   │     ├── 00001.jpg        # video frame
+│   │     ├── 00002.jpg        # video frame
+│   │     ├── 00003.jpg        # video frame
+│   │     └── ...
+│   ├── {video_id}
+│   ├── {video_id}
+│   └── ...
+└── Annotations_6fps # videos are annotated at 6 fps
+    ├── {video_id}
+    │     ├── 000               # obj 000
+    │     │    ├── 00000.png    # mask for object 000 in 00000.jpg
+    │     │    ├── 00004.png    # mask for object 000 in 00004.jpg
+    │     │    ├── 00008.png    # mask for object 000 in 00008.jpg
+    │     │    ├── 00012.png    # mask for object 000 in 00012.jpg
+    │     │    └── ...
+    │     ├── 001               # obj 001
+    │     ├── 002               # obj 002
+    │     └── ...
+    ├── {video_id}
+    ├── {video_id}
+    └── ...
+```
+
+All masklets in val and test sets are manually annotated in every frame by annotators. For each annotated object in a video, we store its annotated masks as separate png files (one object per png). This is because the annotated objects may overlap, e.g., it is possible in our SA-V dataset for there to be a mask for the whole person as well as a separate mask for their hands.
+
+## SA-V Val and Test Evaluation
+
+We provide an evaluator to compute the common J and F metrics on SA-V val and test sets. To run the evaluation, we need to first install a few dependencies as follows:
+
+```
+pip install -r requirements.txt
+```
+
+Then we can evaluate the predictions as follows:
+
+```
+python sav_evaluator.py --gt_root {GT_ROOT} --pred_root {PRED_ROOT}
+```
+
+or run
+
+```
+python sav_evaluator.py --help
+```
+
+to print a complete help message.
+
+The evaluator expects the `GT_ROOT` to be one of the following folder structures, and `GT_ROOT` and `PRED_ROOT` to have the same structure.
+
+- Same as SA-V val and test directory structure
+
+```
+{GT_ROOT}  # gt root folder
+├── {video_id}
+│     ├── 000               # all masks associated with obj 000
+│     │    ├── 00000.png    # mask for object 000 in frame 00000 (binary mask)
+│     │    └── ...
+│     ├── 001               # all masks associated with obj 001
+│     ├── 002               # all masks associated with obj 002
+│     └── ...
+├── {video_id}
+├── {video_id}
+└── ...
+```
+
+In the paper for the experiments on SA-V val and test, we run inference on the 24 fps videos, and evaluate on the subset of frames where we have ground truth annotations (first and last annotated frames dropped). The evaluator will ignore the masks in frames where we don't have ground truth annotations.
+
+- Same as [DAVIS](https://github.com/davisvideochallenge/davis2017-evaluation) directory structure
+
+```
+{GT_ROOT}  # gt root folder
+├── {video_id}
+│     ├── 00000.png        # annotations in frame 00000 (may contain multiple objects)
+│     └── ...
+├── {video_id}
+├── {video_id}
+└── ...
+```
+
+## License
+
+The evaluation code is licensed under the [BSD 3 license](./LICENSE). Please refer to the paper for more details on the models. The videos and annotations in SA-V Dataset are released under CC BY 4.0.
+
+Third-party code: the evaluation software is heavily adapted from [`VOS-Benchmark`](https://github.com/hkchengrex/vos-benchmark) and [`DAVIS`](https://github.com/davisvideochallenge/davis2017-evaluation) (with their licenses in [`LICENSE_DAVIS`](./LICENSE_DAVIS) and [`LICENSE_VOS_BENCHMARK`](./LICENSE_VOS_BENCHMARK)).
--- a/sav_dataset/example/sav_000001.mp4
+++ b/sav_dataset/example/sav_000001.mp4
--- a/sav_dataset/example/sav_000001_auto.json
+++ b/sav_dataset/example/sav_000001_auto.json
--- a/sav_dataset/example/sav_000001_manual.json
+++ b/sav_dataset/example/sav_000001_manual.json
--- a/sav_dataset/requirements.txt
+++ b/sav_dataset/requirements.txt
@@ -0,0 +1,7 @@
+pycocoevalcap
+scikit-image
+opencv-python
+tqdm
+pillow
+numpy
+matplotlib
--- a/sav_dataset/sav_evaluator.py
+++ b/sav_dataset/sav_evaluator.py
@@ -0,0 +1,89 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the sav_dataset directory of this source tree.
+
+# adapted from https://github.com/hkchengrex/vos-benchmark
+# and  https://github.com/davisvideochallenge/davis2017-evaluation
+# with their licenses found in the LICENSE_VOS_BENCHMARK and LICENSE_DAVIS files
+# in the sav_dataset directory.
+from argparse import ArgumentParser
+
+from utils.sav_benchmark import benchmark
+
+"""
+The structure of the {GT_ROOT} can be either of the following two structures.
+{GT_ROOT} and {PRED_ROOT} should be of the same format
+
+1. SA-V val/test structure
+    {GT_ROOT}  # gt root folder
+        ├── {video_id}
+        │     ├── 000               # all masks associated with obj 000
+        │     │    ├── {frame_id}.png    # mask for object 000 in {frame_id} (binary mask)
+        │     │    └── ...
+        │     ├── 001               # all masks associated with obj 001
+        │     ├── 002               # all masks associated with obj 002
+        │     └── ...
+        ├── {video_id}
+        ├── {video_id}
+        └── ...
+
+2. Similar to DAVIS structure:
+
+    {GT_ROOT}  # gt root folder
+        ├── {video_id}
+        │     ├── {frame_id}.png          # annotation in {frame_id} (may contain multiple objects)
+        │     └── ...
+        ├── {video_id}
+        ├── {video_id}
+        └── ...
+"""
+
+
+# Command-line interface of the evaluator. The parser is built at import time so that
+# other code can reuse it; the evaluation itself only runs when executed as a script.
+parser = ArgumentParser()
+parser.add_argument(
+    "--gt_root",
+    required=True,
+    help="Path to the GT folder. For SA-V, it's sav_val/Annotations_6fps or sav_test/Annotations_6fps",
+)
+parser.add_argument(
+    "--pred_root",
+    required=True,
+    help="Path to a folder containing folders of masks to be evaluated, with exactly the same structure as gt_root",
+)
+parser.add_argument(
+    "-n", "--num_processes", default=16, type=int, help="Number of concurrent processes"
+)
+parser.add_argument(
+    "-s",
+    "--strict",
+    help="Make sure every video in the gt_root folder has a corresponding video in the prediction",
+    action="store_true",
+)
+parser.add_argument(
+    "-q",
+    "--quiet",
+    help="Quietly run evaluation without printing the information out",
+    action="store_true",
+)
+
+# https://github.com/davisvideochallenge/davis2017-evaluation/blob/d34fdef71ce3cb24c1a167d860b707e575b3034c/davis2017/evaluation.py#L85
+parser.add_argument(
+    "--do_not_skip_first_and_last_frame",
+    help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
+    "Set this to true for evaluation on settings that doesn't skip first and last frames",
+    action="store_true",
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # benchmark() takes lists of roots (one entry per dataset); the CLI evaluates one dataset
+    benchmark(
+        [args.gt_root],
+        [args.pred_root],
+        args.strict,
+        args.num_processes,
+        verbose=not args.quiet,
+        # the CLI flag is negative ("do not skip"), the API flag is positive ("skip")
+        skip_first_and_last=not args.do_not_skip_first_and_last_frame,
+    )
--- a/sav_dataset/sav_visualization_example.ipynb
+++ b/sav_dataset/sav_visualization_example.ipynb
--- a/sav_dataset/utils/sav_benchmark.py
+++ b/sav_dataset/utils/sav_benchmark.py
@@ -0,0 +1,488 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the sav_dataset directory of this source tree.
+
+# adapted from https://github.com/hkchengrex/vos-benchmark
+# and  https://github.com/davisvideochallenge/davis2017-evaluation
+# with their licenses found in the LICENSE_VOS_BENCHMARK and LICENSE_DAVIS files
+# in the sav_dataset directory.
+import math
+import os
+import time
+from collections import defaultdict
+from multiprocessing import Pool
+from os import path
+from typing import Dict, List, Tuple
+
+import cv2
+import numpy as np
+import tqdm
+from PIL import Image
+from skimage.morphology import disk
+
+
+class VideoEvaluator:
+    """
+    Evaluate the predicted masks of one video against its ground-truth masks.
+
+    An instance is called with a video name (this is how `benchmark` maps it over all
+    the videos of a dataset) and returns the per-object J and F scores of that video.
+    """
+
+    def __init__(self, gt_root, pred_root, skip_first_and_last=True) -> None:
+        """
+        gt_root: path to the folder storing the gt masks
+        pred_root: path to the folder storing the predicted masks
+        skip_first_and_last: whether we should skip the evaluation of the first and the last frame.
+                             True for SA-V val and test, same as in DAVIS semi-supervised evaluation.
+        """
+        self.gt_root = gt_root
+        self.pred_root = pred_root
+        self.skip_first_and_last = skip_first_and_last
+
+    def __call__(self, vid_name: str) -> Tuple[str, Dict[str, float], Dict[str, float]]:
+        """
+        Evaluate a single video.
+
+        vid_name: name of the video to evaluate (a folder under gt_root and pred_root)
+
+        Returns (vid_name, iou, boundary_f). Both dictionaries map an object key to its
+        score averaged over the evaluated frames, in percent. In the SA-V structure the
+        key is the object folder name; otherwise it is the label value found in the pngs.
+        """
+
+        # scan the folder to find subfolders for evaluation and
+        # check if the folder structure is SA-V
+        to_evaluate, is_sav_format = self.scan_vid_folder(vid_name)
+
+        # evaluate each (gt_path, pred_path) pair
+        eval_results = []
+        for all_frames, obj_id, gt_path, pred_path in to_evaluate:
+            if self.skip_first_and_last:
+                # skip the first and the last frames
+                all_frames = all_frames[1:-1]
+
+            evaluator = Evaluator(name=vid_name, obj_id=obj_id)
+            for frame in all_frames:
+                gt_array, pred_array = self.get_gt_and_pred(
+                    gt_path, pred_path, frame, is_sav_format
+                )
+                evaluator.feed_frame(mask=pred_array, gt=gt_array)
+
+            iou, boundary_f = evaluator.conclude()
+            eval_results.append((obj_id, iou, boundary_f))
+
+        if is_sav_format:
+            iou_output, boundary_f_output = self.consolidate(eval_results)
+        else:
+            # DAVIS-like structure: a single evaluator already holds every object
+            assert len(eval_results) == 1
+            iou_output = eval_results[0][1]
+            boundary_f_output = eval_results[0][2]
+
+        return vid_name, iou_output, boundary_f_output
+
+    def get_gt_and_pred(
+        self,
+        gt_path: str,
+        pred_path: str,
+        f_name: str,
+        is_sav_format: bool,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Get the ground-truth and predicted masks for a single frame.
+
+        gt_path / pred_path: folders holding the gt and predicted pngs
+        f_name: file name of the frame, identical in both folders
+        is_sav_format: when True, both masks must hold at most one object and are
+                       returned as boolean arrays
+
+        Raises AssertionError if the prediction is missing, if the two masks differ in
+        size, or if a SA-V mask holds more than two distinct values.
+        """
+        gt_mask_path = path.join(gt_path, f_name)
+        pred_mask_path = path.join(pred_path, f_name)
+        assert os.path.exists(pred_mask_path), f"{pred_mask_path} not found"
+
+        gt_array = np.array(Image.open(gt_mask_path))
+        pred_array = np.array(Image.open(pred_mask_path))
+        assert (
+            gt_array.shape[-2:] == pred_array.shape[-2:]
+        ), f"shape mismatch: {gt_mask_path}, {pred_mask_path}"
+
+        if is_sav_format:
+            assert len(np.unique(gt_array)) <= 2, (
+                f"found more than 1 object in {gt_mask_path} "
+                "SA-V format assumes one object mask per png file."
+            )
+            assert len(np.unique(pred_array)) <= 2, (
+                f"found more than 1 object in {pred_mask_path} "
+                "SA-V format assumes one object mask per png file."
+            )
+            # binarize: any non-zero value is foreground
+            gt_array = gt_array > 0
+            pred_array = pred_array > 0
+
+        return gt_array, pred_array
+
+    def scan_vid_folder(self, vid_name) -> Tuple[List, bool]:
+        """
+        Scan the folder structure of the video and return a list of folders to evaluate.
+
+        Returns (to_evaluate, is_sav_format) where every entry of to_evaluate is a
+        (sorted frame file names, object folder name or None, gt folder, prediction folder)
+        tuple. The frame list always comes from the ground-truth folder.
+        """
+
+        vid_gt_path = path.join(self.gt_root, vid_name)
+        vid_pred_path = path.join(self.pred_root, vid_name)
+        all_files_and_dirs = sorted(os.listdir(vid_gt_path))
+        to_evaluate = []
+        if all(name.endswith(".png") for name in all_files_and_dirs):
+            # All files are png files, dataset structure similar to DAVIS
+            is_sav_format = False
+            frames = all_files_and_dirs
+            obj_dir = None
+            to_evaluate.append((frames, obj_dir, vid_gt_path, vid_pred_path))
+        else:
+            # SA-V dataset structure, going one layer down into each subdirectory
+            is_sav_format = True
+            for obj_dir in all_files_and_dirs:
+                obj_gt_path = path.join(vid_gt_path, obj_dir)
+                if not path.isdir(obj_gt_path):
+                    # stray files (e.g. OS metadata files) are not object folders;
+                    # listing them would raise NotADirectoryError
+                    continue
+                obj_pred_path = path.join(vid_pred_path, obj_dir)
+                frames = sorted(os.listdir(obj_gt_path))
+                to_evaluate.append((frames, obj_dir, obj_gt_path, obj_pred_path))
+            assert to_evaluate, f"no object folders found in {vid_gt_path}"
+        return to_evaluate, is_sav_format
+
+    def consolidate(self, eval_results) -> Tuple[Dict[str, float], Dict[str, float]]:
+        """
+        Consolidate the results of all the objects from the video into one dictionary.
+
+        eval_results: list of (obj_id, iou, boundary_f) where both dictionaries hold
+                      exactly one entry (one object per SA-V object folder)
+
+        Returns (iou_output, boundary_f_output), both keyed by obj_id.
+        """
+        iou_output = {}
+        boundary_f_output = {}
+        for obj_id, iou, boundary_f in eval_results:
+            assert len(iou) == 1
+            key = list(iou.keys())[0]
+            iou_output[obj_id] = iou[key]
+            boundary_f_output[obj_id] = boundary_f[key]
+        return iou_output, boundary_f_output
+
+
+#################################################################################################################
+# Functions below are from https://github.com/hkchengrex/vos-benchmark with minor modifications
+# _seg2bmap from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/utils.py
+# get_iou and Evaluator from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/evaluator.py
+# benchmark from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/benchmark.py with slight mod
+#################################################################################################################
+
+
+def _seg2bmap(seg, width=None, height=None):
+    """
+    From a segmentation, compute a binary boundary map with 1 pixel wide
+    boundaries.  The boundary pixels are offset by 1/2 pixel towards the
+    origin from the actual segment boundary.
+    Arguments:
+        seg     : Segments labeled from 1..k (any non-zero value is foreground).
+        width   : Width of desired bmap  <= seg.shape[1]
+        height  : Height of desired bmap <= seg.shape[0]
+    Returns:
+        bmap (ndarray): Binary boundary map. Boolean when no resizing is requested,
+                        otherwise a float array of shape (height, width).
+    Raises:
+        AssertionError if seg has more than one channel, or if the requested size is
+        larger than seg or changes its aspect ratio by more than 0.01.
+     David Martin <dmartin@eecs.berkeley.edu>
+     January 2003
+    """
+
+    # astype() copies, so the caller's array is never modified
+    seg = seg.astype(bool)
+
+    assert np.atleast_3d(seg).shape[2] == 1
+
+    h, w = seg.shape[:2]
+    width = w if width is None else width
+    height = h if height is None else height
+
+    ar1 = float(width) / float(height)
+    ar2 = float(w) / float(h)
+
+    # `or`, not `|`: the bitwise operator binds tighter than the comparisons and
+    # would turn this check into an unrelated chained comparison
+    assert not (
+        width > w or height > h or abs(ar1 - ar2) > 0.01
+    ), "Can't convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+
+    # neighbours to the east, south and south-east (zero beyond the border)
+    e = np.zeros_like(seg)
+    s = np.zeros_like(seg)
+    se = np.zeros_like(seg)
+
+    e[:, :-1] = seg[:, 1:]
+    s[:-1, :] = seg[1:, :]
+    se[:-1, :-1] = seg[1:, 1:]
+
+    # a pixel is on the boundary when it differs from any of these neighbours
+    b = seg ^ e | seg ^ s | seg ^ se
+    # the last row / column only have one valid neighbour direction
+    b[-1, :] = seg[-1, :] ^ e[-1, :]
+    b[:, -1] = seg[:, -1] ^ s[:, -1]
+    b[-1, -1] = 0
+
+    if w == width and h == height:
+        bmap = b
+    else:
+        # project every boundary pixel onto the smaller grid (0-based form of the
+        # original MATLAB `1 + floor((y - 1) * height / h)` mapping)
+        bmap = np.zeros((height, width))
+        ys, xs = np.nonzero(b)[:2]
+        bmap[(ys * height) // h, (xs * width) // w] = 1
+
+    return bmap
+
+
+def get_iou(intersection, pixel_sum):
+    """
+    Intersection-over-union from the intersection size and the summed mask sizes.
+
+    intersection: number of pixels shared by the two masks
+    pixel_sum: number of pixels in the first mask plus number of pixels in the second
+
+    Returns 1 when both masks are empty, so no epsilon is needed in the division.
+    """
+    if intersection != pixel_sum:
+        # regular case: the union is the summed sizes minus the doubly counted overlap
+        return intersection / (pixel_sum - intersection)
+
+    # the two values can only coincide when both mask and gt have zero pixels
+    assert intersection == 0
+    return 1
+
+
+class Evaluator:
+    """
+    Accumulate, frame by frame, the region IoU and the boundary F-score of every object.
+    """
+
+    def __init__(self, boundary=0.008, name=None, obj_id=None):
+        self.name = name
+        self.obj_id = obj_id
+        # boundary: used in computing boundary F-score; it is multiplied by the norm of
+        # the mask shape to get the tolerance radius in pixels
+        self.boundary = boundary
+
+        # labels seen so far in the ground truth and in the predictions
+        self.objects_in_gt = set()
+        self.objects_in_masks = set()
+
+        # per-label lists holding one score per fed frame
+        self.object_iou = defaultdict(list)
+        self.boundary_f = defaultdict(list)
+
+    def feed_frame(self, mask: np.ndarray, gt: np.ndarray):
+        """
+        Compute and accumulate metrics for a single frame (mask/gt pair)
+
+        mask: predicted label map of the frame
+        gt: ground-truth label map of the frame; zero is background in both
+        """
+
+        # remember every non-zero label of this frame
+        gt_labels = np.unique(gt)
+        pred_labels = np.unique(mask)
+        self.objects_in_gt.update(set(gt_labels[gt_labels != 0].tolist()))
+        self.objects_in_masks.update(set(pred_labels[pred_labels != 0].tolist()))
+
+        # boundary disk for boundary F-score. It is the same for all objects.
+        radius = np.ceil(self.boundary * np.linalg.norm(mask.shape))
+        structuring_element = disk(radius)
+
+        # score every label seen so far, including those absent from this frame
+        for label in self.objects_in_gt | self.objects_in_masks:
+            pred_obj = mask == label
+            gt_obj = gt == label
+
+            # object iou
+            overlap = (pred_obj * gt_obj).sum()
+            total = pred_obj.sum() + gt_obj.sum()
+            self.object_iou[label].append(get_iou(overlap, total))
+
+            # boundary f-score
+            # This part is copied from davis2017-evaluation
+            pred_edge = _seg2bmap(pred_obj)
+            gt_edge = _seg2bmap(gt_obj)
+            pred_edge_dilated = cv2.dilate(
+                pred_edge.astype(np.uint8), structuring_element
+            )
+            gt_edge_dilated = cv2.dilate(gt_edge.astype(np.uint8), structuring_element)
+
+            # number of boundary pixels in the prediction and in the ground truth
+            n_fg = np.sum(pred_edge)
+            n_gt = np.sum(gt_edge)
+
+            # Compute precision and recall
+            if n_fg == 0 and n_gt == 0:
+                precision, recall = 1, 1
+            elif n_fg == 0:
+                precision, recall = 1, 0
+            elif n_gt == 0:
+                precision, recall = 0, 1
+            else:
+                # boundary pixels lying within the tolerance of the other boundary
+                precision = np.sum(pred_edge * gt_edge_dilated) / float(n_fg)
+                recall = np.sum(gt_edge * pred_edge_dilated) / float(n_gt)
+
+            # Compute F measure
+            pr_sum = precision + recall
+            f_measure = 0 if pr_sum == 0 else 2 * precision * recall / pr_sum
+            self.boundary_f[label].append(f_measure)
+
+    def conclude(self):
+        """
+        Average the accumulated scores over frames.
+
+        Returns (all_iou, all_boundary_f): two dictionaries keyed by the labels found in
+        the ground truth, with values scaled to percent. Labels that only appeared in
+        the predictions are left out.
+        """
+        mean_iou = {}
+        mean_boundary_f = {}
+
+        for label in self.objects_in_gt:
+            mean_iou[label] = np.mean(self.object_iou[label]) * 100
+            mean_boundary_f[label] = np.mean(self.boundary_f[label]) * 100
+
+        return mean_iou, mean_boundary_f
+
+
+def benchmark(
+    gt_roots,
+    mask_roots,
+    strict=True,
+    num_processes=None,
+    *,
+    verbose=True,
+    skip_first_and_last=True,
+):
+    """
+    Evaluate predicted masks against the ground truth of one or several datasets.
+
+    gt_roots: a list of paths to datasets, i.e., [path_to_DatasetA, path_to_DatasetB, ...]
+    mask_roots: same as above, but the .png are masks predicted by the model
+    strict: when True, all videos in the dataset must have corresponding predictions.
+            Setting it to False is useful in cases where the ground-truth contains both train/val
+                sets, but the model only predicts the val subset.
+            Either way, if a video is predicted (i.e., the corresponding folder exists),
+                then it must at least contain all the masks in the ground truth annotations.
+                Masks that are in the prediction but not in the ground-truth
+                (i.e., sparse annotations) are ignored.
+    num_processes: number of worker processes, passed to multiprocessing.Pool as is.
+    verbose: when True, print the progress, the per-object table and the summary.
+    skip_first_and_last: whether we should skip the first and the last frame in evaluation.
+                            This is used by DAVIS 2017 in their semi-supervised evaluation.
+                            It should be disabled for unsupervised evaluation.
+
+    Returns (all_global_jf, all_global_j, all_global_f, all_object_metrics): four lists
+    with one entry per dataset. The per-object table of every dataset is also written
+    to `results.csv` inside its mask root.
+
+    Raises SystemExit(1) when `strict` is set and the gt and prediction video folders
+    of a dataset differ.
+    """
+
+    assert len(gt_roots) == len(mask_roots)
+    single_dataset = len(gt_roots) == 1
+
+    if verbose:
+        if skip_first_and_last:
+            print(
+                "We are *SKIPPING* the evaluation of the first and the last frame (standard for semi-supervised video object segmentation)."
+            )
+        else:
+            print(
+                "We are *NOT SKIPPING* the evaluation of the first and the last frame (*NOT STANDARD* for semi-supervised video object segmentation)."
+            )
+
+    pool = Pool(num_processes)
+    try:
+        start = time.time()
+        to_wait = []
+        for gt_root, mask_root in zip(gt_roots, mask_roots):
+            # Validate folders
+            validated = True
+            gt_videos = os.listdir(gt_root)
+            mask_videos = os.listdir(mask_root)
+
+            # if the user passed the root directory instead of Annotations
+            if len(gt_videos) != len(mask_videos) and "Annotations" in gt_videos:
+                annotations_root = path.join(gt_root, "Annotations")
+                annotation_entries = os.listdir(annotations_root)
+                # an empty Annotations folder gives nothing to inspect; leave gt_root alone
+                if annotation_entries and ".png" not in annotation_entries[0]:
+                    gt_root = annotations_root
+                    gt_videos = annotation_entries
+
+            # remove non-folder items
+            gt_videos = list(
+                filter(lambda x: path.isdir(path.join(gt_root, x)), gt_videos)
+            )
+            mask_videos = list(
+                filter(lambda x: path.isdir(path.join(mask_root, x)), mask_videos)
+            )
+
+            if not strict:
+                videos = sorted(list(set(gt_videos) & set(mask_videos)))
+            else:
+                gt_extras = set(gt_videos) - set(mask_videos)
+                mask_extras = set(mask_videos) - set(gt_videos)
+
+                if len(gt_extras) > 0:
+                    print(
+                        f"Videos that are in {gt_root} but not in {mask_root}: {gt_extras}"
+                    )
+                    validated = False
+                if len(mask_extras) > 0:
+                    print(
+                        f"Videos that are in {mask_root} but not in {gt_root}: {mask_extras}"
+                    )
+                    validated = False
+                if not validated:
+                    print("Validation failed. Exiting.")
+                    # SystemExit directly: the `exit` builtin only exists when the
+                    # `site` module is loaded
+                    raise SystemExit(1)
+
+                videos = sorted(gt_videos)
+
+            if verbose:
+                print(
+                    f"In dataset {gt_root}, we are evaluating on {len(videos)} videos: {videos}"
+                )
+
+            video_evaluator = VideoEvaluator(
+                gt_root, mask_root, skip_first_and_last=skip_first_and_last
+            )
+            if single_dataset:
+                if verbose:
+                    # imap is lazy, which lets tqdm show the progress video by video
+                    results = tqdm.tqdm(
+                        pool.imap(video_evaluator, videos), total=len(videos)
+                    )
+                else:
+                    results = pool.map(video_evaluator, videos)
+            else:
+                # queue every dataset first and collect the results afterwards
+                to_wait.append(pool.map_async(video_evaluator, videos))
+
+        # no more tasks will be submitted; already queued ones keep running
+        pool.close()
+
+        all_global_jf, all_global_j, all_global_f = [], [], []
+        all_object_metrics = []
+        for i, mask_root in enumerate(mask_roots):
+            if not single_dataset:
+                results = to_wait[i].get()
+
+            all_iou = []
+            all_boundary_f = []
+            object_metrics = {}
+            for name, iou, boundary_f in results:
+                all_iou.extend(list(iou.values()))
+                all_boundary_f.extend(list(boundary_f.values()))
+                object_metrics[name] = (iou, boundary_f)
+
+            global_j = np.array(all_iou).mean()
+            global_f = np.array(all_boundary_f).mean()
+            global_jf = (global_j + global_f) / 2
+
+            time_taken = time.time() - start
+
+            # Build string for reporting results
+            # find max length for padding (a single list keeps max() valid with no videos)
+            ml = max([len(n) for n in object_metrics] + [len("Global score")])
+            # build header
+            out_string = f'{"sequence":<{ml}},{"obj":>3}, {"J&F":>4}, {"J":>4}, {"F":>4}\n'
+            out_string += f'{"Global score":<{ml}},{"":>3}, {global_jf:.1f}, {global_j:.1f}, {global_f:.1f}\n'
+            # append one line for each object
+            for name, (iou, boundary_f) in object_metrics.items():
+                for object_idx in iou.keys():
+                    j, f = iou[object_idx], boundary_f[object_idx]
+                    jf = (j + f) / 2
+                    out_string += f"{name:<{ml}},{object_idx:03}, {jf:>4.1f}, {j:>4.1f}, {f:>4.1f}\n"
+
+            # print to console
+            if verbose:
+                print(out_string.replace(",", " "), end="")
+                print("\nSummary:")
+                print(
+                    f"Global score: J&F: {global_jf:.1f} J: {global_j:.1f} F: {global_f:.1f}"
+                )
+                print(f"Time taken: {time_taken:.2f}s")
+
+            # print to file
+            result_path = path.join(mask_root, "results.csv")
+            print(f"Saving the results to {result_path}")
+            with open(result_path, "w") as f:
+                f.write(out_string)
+
+            all_global_jf.append(global_jf)
+            all_global_j.append(global_j)
+            all_global_f.append(global_f)
+            all_object_metrics.append(object_metrics)
+    finally:
+        # every result has been collected on the success path, so terminating is safe;
+        # on an error path it stops the workers instead of leaving them behind
+        pool.terminate()
+        pool.join()
+
+    return all_global_jf, all_global_j, all_global_f, all_object_metrics
--- a/sav_dataset/utils/sav_utils.py
+++ b/sav_dataset/utils/sav_utils.py
@@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the sav_dataset directory of this source tree.
+import json
+import os
+from typing import Dict, List, Optional, Tuple
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import pycocotools.mask as mask_util
+
+
+def decode_video(video_path: str) -> List[np.ndarray]:
+    """
+    Decode the video and return the RGB frames
+    """
+    video = cv2.VideoCapture(video_path)
+    video_frames = []
+    while video.isOpened():
+        ret, frame = video.read()
+        if ret:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            video_frames.append(frame)
+        else:
+            break
+    return video_frames
+
+
+def show_anns(masks, colors: List, borders=True) -> None:
+    """
+    show the annotations
+    """
+    # return if no masks
+    if len(masks) == 0:
+        return
+
+    # sort masks by size
+    sorted_annot_and_color = sorted(
+        zip(masks, colors), key=(lambda x: x[0].sum()), reverse=True
+    )
+    H, W = sorted_annot_and_color[0][0].shape[0], sorted_annot_and_color[0][0].shape[1]
+
+    canvas = np.ones((H, W, 4))
+    canvas[:, :, 3] = 0  # set the alpha channel
+    contour_thickness = max(1, int(min(5, 0.01 * min(H, W))))
+    for mask, color in sorted_annot_and_color:
+        canvas[mask] = np.concatenate([color, [0.55]])
+        if borders:
+            contours, _ = cv2.findContours(
+                np.array(mask, dtype=np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE
+            )
+            cv2.drawContours(
+                canvas, contours, -1, (0.05, 0.05, 0.05, 1), thickness=contour_thickness
+            )
+
+    ax = plt.gca()
+    ax.imshow(canvas)
+
+
+class SAVDataset:
+    """
+    SAVDataset is a class to load the SAV dataset and visualize the annotations.
+    """
+
+    def __init__(self, sav_dir, annot_sample_rate=4):
+        """
+        Args:
+            sav_dir: the directory of the SAV dataset
+            annot_sample_rate: the sampling rate of the annotations.
+                The annotations are aligned with the videos at 6 fps.
+        """
+        self.sav_dir = sav_dir
+        self.annot_sample_rate = annot_sample_rate
+        self.manual_mask_colors = np.random.random((256, 3))
+        self.auto_mask_colors = np.random.random((256, 3))
+
+    def read_frames(self, mp4_path: str) -> None:
+        """
+        Read the frames and downsample them to align with the annotations.
+        """
+        if not os.path.exists(mp4_path):
+            print(f"{mp4_path} doesn't exist.")
+            return None
+        else:
+            # decode the video
+            frames = decode_video(mp4_path)
+            print(f"There are {len(frames)} frames decoded from {mp4_path} (24fps).")
+
+            # downsample the frames to align with the annotations
+            frames = frames[:: self.annot_sample_rate]
+            print(
+                f"Videos are annotated every {self.annot_sample_rate} frames. "
+                "To align with the annotations, "
+                f"downsample the video to {len(frames)} frames."
+            )
+            return frames
+
+    def get_frames_and_annotations(
+        self, video_id: str
+    ) -> Tuple[List | None, Dict | None, Dict | None]:
+        """
+        Get the frames and annotations for video.
+        """
+        # load the video
+        mp4_path = os.path.join(self.sav_dir, video_id + ".mp4")
+        frames = self.read_frames(mp4_path)
+        if frames is None:
+            return None, None, None
+
+        # load the manual annotations
+        manual_annot_path = os.path.join(self.sav_dir, video_id + "_manual.json")
+        if not os.path.exists(manual_annot_path):
+            print(f"{manual_annot_path} doesn't exist. Something might be wrong.")
+            manual_annot = None
+        else:
+            manual_annot = json.load(open(manual_annot_path))
+
+        # load the manual annotations
+        auto_annot_path = os.path.join(self.sav_dir, video_id + "_auto.json")
+        if not os.path.exists(auto_annot_path):
+            print(f"{auto_annot_path} doesn't exist.")
+            auto_annot = None
+        else:
+            auto_annot = json.load(open(auto_annot_path))
+
+        return frames, manual_annot, auto_annot
+
+    def visualize_annotation(
+        self,
+        frames: List[np.ndarray],
+        auto_annot: Optional[Dict],
+        manual_annot: Optional[Dict],
+        annotated_frame_id: int,
+        show_auto=True,
+        show_manual=True,
+    ) -> None:
+        """
+        Visualize the annotations on the annotated_frame_id.
+        If show_manual is True, show the manual annotations.
+        If show_auto is True, show the auto annotations.
+        By default, show both auto and manual annotations.
+        """
+
+        if annotated_frame_id >= len(frames):
+            print("invalid annotated_frame_id")
+            return
+
+        rles = []
+        colors = []
+        if show_manual and manual_annot is not None:
+            rles.extend(manual_annot["masklet"][annotated_frame_id])
+            colors.extend(
+                self.manual_mask_colors[
+                    : len(manual_annot["masklet"][annotated_frame_id])
+                ]
+            )
+        if show_auto and auto_annot is not None:
+            rles.extend(auto_annot["masklet"][annotated_frame_id])
+            colors.extend(
+                self.auto_mask_colors[: len(auto_annot["masklet"][annotated_frame_id])]
+            )
+
+        plt.imshow(frames[annotated_frame_id])
+
+        if len(rles) > 0:
+            masks = [mask_util.decode(rle) > 0 for rle in rles]
+            show_anns(masks, colors)
+        else:
+            print("No annotation will be shown")
+
+        plt.axis("off")
+        plt.show()