Grounded-SAM-2/sam2/build_sam.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os

import torch
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf

import sam2

# Import-time guard: refuse to continue when the imported `sam2` looks like the
# cloned *repository* rather than the installed *package*. The repository is also
# named `sam2` and keeps the real package in `sam2/sam2`, so launching Python from
# the directory that contains the clone can shadow the package.
_nested_sam2_dir = os.path.join(sam2.__path__[0], "sam2")
if os.path.isdir(_nested_sam2_dir):
    # A "sam2" directory inside the imported `sam2` means the user is likely
    # importing the repo itself; this is not supported, so fail with guidance.
    raise RuntimeError(
        "You're likely running Python from the parent directory of the sam2 repository "
        "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
        "This is not supported since the `sam2` Python package could be shadowed by the "
        "repository name (the repository is also named `sam2` and contains the Python package "
        "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
        "rather than its parent dir, or from your home directory) after installing SAM 2."
    )
# Drop the temporary so the module namespace stays exactly as before.
del _nested_sam2_dir


# Maps a Hugging Face model id to a pair of
# (config path, checkpoint filename within the Hugging Face repo).
# Every entry follows the same naming scheme, so the table is generated from
# the model versions and the per-size name fragments:
#   (suffix used in the hub id, abbreviation used in the config name,
#    suffix used in the checkpoint filename)
HF_MODEL_ID_TO_FILENAMES = {
    f"facebook/{version}-hiera-{hub_size}": (
        f"configs/{version}/{version}_hiera_{config_size}.yaml",
        f"{version}_hiera_{ckpt_size}.pt",
    )
    for version in ("sam2", "sam2.1")
    for hub_size, config_size, ckpt_size in (
        ("tiny", "t", "tiny"),
        ("small", "s", "small"),
        ("base-plus", "b+", "base_plus"),
        ("large", "l", "large"),
    )
}


def build_sam2(
    config_file,
    ckpt_path=None,
    device="cuda",
    mode="eval",
    hydra_overrides_extra=None,
    apply_postprocessing=True,
    **kwargs,
):
    """Build a SAM 2 model from a Hydra config and optionally load a checkpoint.

    Args:
        config_file: Config name handed to Hydra's ``compose``.
        ckpt_path: Checkpoint file to load into the model, or ``None`` to skip
            loading weights.
        device: Target passed to ``model.to(...)``.
        mode: When equal to ``"eval"`` the model is switched to eval mode; any
            other value leaves the model's mode untouched.
        hydra_overrides_extra: Optional iterable of additional Hydra override
            strings. It is copied, never mutated. ``None`` means no extras.
        apply_postprocessing: When true, append overrides that enable the
            dynamic multi-mask fallback based on mask stability.
        **kwargs: Accepted but not read by this function.

    Returns:
        The instantiated model, moved to ``device``.

    Raises:
        RuntimeError: Propagated from checkpoint loading when the checkpoint
            keys do not match the model.
    """
    # Always work on a private list: avoids a shared mutable default and
    # guarantees the caller's sequence is left untouched.
    overrides = [] if hydra_overrides_extra is None else list(hydra_overrides_extra)

    if apply_postprocessing:
        overrides.extend(
            [
                # dynamically fall back to multi-mask if the single mask is not stable
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
            ]
        )

    # Read config and init model
    cfg = compose(config_name=config_file, overrides=overrides)
    OmegaConf.resolve(cfg)
    model = instantiate(cfg.model, _recursive_=True)
    _load_checkpoint(model, ckpt_path)
    model = model.to(device)
    if mode == "eval":
        model.eval()
    return model


def build_sam2_video_predictor(
    config_file,
    ckpt_path=None,
    device="cuda",
    mode="eval",
    hydra_overrides_extra=None,
    apply_postprocessing=True,
    vos_optimized=False,
    **kwargs,
):
    """Build a SAM 2 video predictor from a Hydra config.

    The config's ``model._target_`` is overridden so that a video predictor
    class is instantiated instead of whatever the config names.

    Args:
        config_file: Config name handed to Hydra's ``compose``.
        ckpt_path: Checkpoint file to load into the model, or ``None`` to skip
            loading weights.
        device: Target passed to ``model.to(...)``.
        mode: When equal to ``"eval"`` the model is switched to eval mode; any
            other value leaves the model's mode untouched.
        hydra_overrides_extra: Optional iterable of additional Hydra override
            strings, applied after the predictor-class overrides. It is
            copied, never mutated. ``None`` means no extras.
        apply_postprocessing: When true, append the post-processing overrides
            (dynamic multi-mask fallback, mask binarization for the memory
            encoder, small-hole filling) after the extras.
        vos_optimized: When true, target ``SAM2VideoPredictorVOS`` and set
            ``compile_image_encoder=True``; otherwise target
            ``SAM2VideoPredictor``.
        **kwargs: Accepted but not read by this function.

    Returns:
        The instantiated predictor, moved to ``device``.

    Raises:
        RuntimeError: Propagated from checkpoint loading when the checkpoint
            keys do not match the model.
    """
    if vos_optimized:
        hydra_overrides = [
            "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictorVOS",
            "++model.compile_image_encoder=True",  # Let sam2_base handle this
        ]
    else:
        hydra_overrides = [
            "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
        ]

    # Caller-supplied extras follow the predictor selection; iterating instead
    # of mutating keeps the caller's sequence (and any default) untouched.
    if hydra_overrides_extra is not None:
        hydra_overrides.extend(hydra_overrides_extra)

    if apply_postprocessing:
        hydra_overrides.extend(
            [
                # dynamically fall back to multi-mask if the single mask is not stable
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
                "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
                # binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
                "++model.binarize_mask_from_pts_for_mem_enc=true",
                # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
                "++model.fill_hole_area=8",
            ]
        )

    # Read config and init model
    cfg = compose(config_name=config_file, overrides=hydra_overrides)
    OmegaConf.resolve(cfg)
    model = instantiate(cfg.model, _recursive_=True)
    _load_checkpoint(model, ckpt_path)
    model = model.to(device)
    if mode == "eval":
        model.eval()
    return model


def _hf_download(model_id):
    """Resolve ``model_id`` to its config name and a downloaded checkpoint path.

    Looks the id up in ``HF_MODEL_ID_TO_FILENAMES`` (an unknown id raises
    ``KeyError``) and fetches the checkpoint file with ``hf_hub_download``.

    Returns:
        A ``(config_name, ckpt_path)`` tuple, where ``ckpt_path`` is whatever
        ``hf_hub_download`` returned.
    """
    # Imported lazily so huggingface_hub is only required when this helper runs.
    from huggingface_hub import hf_hub_download

    cfg_name, weights_filename = HF_MODEL_ID_TO_FILENAMES[model_id]
    return cfg_name, hf_hub_download(repo_id=model_id, filename=weights_filename)


def build_sam2_hf(model_id, **kwargs):
    """Build a SAM 2 model from a Hugging Face model id.

    The config name and checkpoint are resolved via ``_hf_download``; all
    remaining keyword arguments are forwarded to ``build_sam2``.
    """
    hf_config, hf_checkpoint = _hf_download(model_id)
    model = build_sam2(config_file=hf_config, ckpt_path=hf_checkpoint, **kwargs)
    return model


def build_sam2_video_predictor_hf(model_id, **kwargs):
    """Build a SAM 2 video predictor from a Hugging Face model id.

    The config name and checkpoint are resolved via ``_hf_download``; all
    remaining keyword arguments are forwarded to
    ``build_sam2_video_predictor``.
    """
    hf_config, hf_checkpoint = _hf_download(model_id)
    predictor = build_sam2_video_predictor(
        config_file=hf_config,
        ckpt_path=hf_checkpoint,
        **kwargs,
    )
    return predictor


def _load_checkpoint(model, ckpt_path):
    if ckpt_path is not None:
        sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
        missing_keys, unexpected_keys = model.load_state_dict(sd)
        if missing_keys:
            logging.error(missing_keys)
            raise RuntimeError()
        if unexpected_keys:
            logging.error(unexpected_keys)
            raise RuntimeError()
        logging.info("Loaded checkpoint sucessfully")
Initial commit 2024-07-29 21:54:20 +00:00			`# Copyright (c) Meta Platforms, Inc. and affiliates.`
			`# All rights reserved.`

			`# This source code is licensed under the license found in the`
			`# LICENSE file in the root directory of this source tree.`

			`import logging`
[doc] Check and raise an error if the user is running Python from the parent directory of the sam2 repo (#359) If the user has "sam2/sam2" in their path, they are likey importing the repo itself as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory). This typically happens because the user is running Python from the parent directory that contains the sam2 repo they cloned. In general, the user should not run Python from the parent dir when the repo is cloned into (same is true for e.g. Numpy repo that contains names like `numpy/numpy` where the module and the repo have the same name), as the user encountered in https://github.com/facebookresearch/sam2/issues/346. (close https://github.com/facebookresearch/sam2/issues/346) 2024-10-05 00:34:06 -07:00			`import os`
Initial commit 2024-07-29 21:54:20 +00:00
			`import torch`
			`from hydra import compose`
			`from hydra.utils import instantiate`
			`from omegaconf import OmegaConf`

[doc] Check and raise an error if the user is running Python from the parent directory of the sam2 repo (#359) If the user has "sam2/sam2" in their path, they are likey importing the repo itself as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory). This typically happens because the user is running Python from the parent directory that contains the sam2 repo they cloned. In general, the user should not run Python from the parent dir when the repo is cloned into (same is true for e.g. Numpy repo that contains names like `numpy/numpy` where the module and the repo have the same name), as the user encountered in https://github.com/facebookresearch/sam2/issues/346. (close https://github.com/facebookresearch/sam2/issues/346) 2024-10-05 00:34:06 -07:00			`import sam2`

			`# Check if the user is running Python from the parent directory of the sam2 repo`
			`# (i.e. the directory where this repo is cloned into) -- this is not supported since`
			`# it could shadow the sam2 package and cause issues.`
			`if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):`
			`# If the user has "sam2/sam2" in their path, they are likey importing the repo itself`
			`# as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).`
			`# This typically happens because the user is running Python from the parent directory`
			`# that contains the sam2 repo they cloned.`
			`raise RuntimeError(`
			`"You're likely running Python from the parent directory of the sam2 repository "`
			`"(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "`
			"This is not supported since the `sam2` Python package could be shadowed by the "
			"repository name (the repository is also named `sam2` and contains the Python package "
			"in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
			`"rather than its parent dir, or from your home directory) after installing SAM 2."`
			`)`


SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`HF_MODEL_ID_TO_FILENAMES = {`
			`"facebook/sam2-hiera-tiny": (`
			`"configs/sam2/sam2_hiera_t.yaml",`
			`"sam2_hiera_tiny.pt",`
			`),`
			`"facebook/sam2-hiera-small": (`
			`"configs/sam2/sam2_hiera_s.yaml",`
			`"sam2_hiera_small.pt",`
			`),`
			`"facebook/sam2-hiera-base-plus": (`
			`"configs/sam2/sam2_hiera_b+.yaml",`
			`"sam2_hiera_base_plus.pt",`
			`),`
			`"facebook/sam2-hiera-large": (`
			`"configs/sam2/sam2_hiera_l.yaml",`
			`"sam2_hiera_large.pt",`
			`),`
			`"facebook/sam2.1-hiera-tiny": (`
			`"configs/sam2.1/sam2.1_hiera_t.yaml",`
			`"sam2.1_hiera_tiny.pt",`
			`),`
			`"facebook/sam2.1-hiera-small": (`
			`"configs/sam2.1/sam2.1_hiera_s.yaml",`
			`"sam2.1_hiera_small.pt",`
			`),`
			`"facebook/sam2.1-hiera-base-plus": (`
			`"configs/sam2.1/sam2.1_hiera_b+.yaml",`
			`"sam2.1_hiera_base_plus.pt",`
			`),`
			`"facebook/sam2.1-hiera-large": (`
			`"configs/sam2.1/sam2.1_hiera_l.yaml",`
			`"sam2.1_hiera_large.pt",`
			`),`
			`}`

Initial commit 2024-07-29 21:54:20 +00:00
			`def build_sam2(`
			`config_file,`
			`ckpt_path=None,`
			`device="cuda",`
			`mode="eval",`
			`hydra_overrides_extra=[],`
			`apply_postprocessing=True,`
Fix HF image predictor 2024-08-12 23:41:41 +00:00			`**kwargs,`
Initial commit 2024-07-29 21:54:20 +00:00			`):`

			`if apply_postprocessing:`
			`hydra_overrides_extra = hydra_overrides_extra.copy()`
			`hydra_overrides_extra += [`
			`# dynamically fall back to multi-mask if the single mask is not stable`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",`
			`]`
			`# Read config and init model`
			`cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)`
			`OmegaConf.resolve(cfg)`
			`model = instantiate(cfg.model, _recursive_=True)`
			`_load_checkpoint(model, ckpt_path)`
			`model = model.to(device)`
			`if mode == "eval":`
			`model.eval()`
			`return model`


			`def build_sam2_video_predictor(`
			`config_file,`
			`ckpt_path=None,`
			`device="cuda",`
			`mode="eval",`
			`hydra_overrides_extra=[],`
			`apply_postprocessing=True,`
SAM 2 Update 12/11/2024 -- full model compilation for a major VOS speedup and a new SAM2VideoPredictor to better handle multi-object tracking (#486) This PR provides new features and updates for SAM 2: - We now support `torch.compile` of the entire SAM 2 model on videos, which can be turned on by setting `vos_optimized=True` in `build_sam2_video_predictor` (it uses the new `SAM2VideoPredictorVOS` predictor class in `sam2/sam2_video_predictor.py`). * Compared to the previous setting (which only compiles the image encoder backbone), the new full model compilation gives a major speedup in inference FPS. * In the VOS prediction script `tools/vos_inference.py`, you can specify this option in `tools/vos_inference.py` via the `--use_vos_optimized_video_predictor` flag. * Note that turning on this flag might introduce a small variance in the predictions due to numerical differences caused by `torch.compile` of the full model. * PyTorch 2.5.1 is the minimum version for full support of this feature. (Earlier PyTorch versions might run into compilation errors in some cases.) Therefore, we have updated the minimum PyTorch version to 2.5.1 accordingly in the installation scripts. - We also update the implementation of the `SAM2VideoPredictor` class for the SAM 2 video prediction in `sam2/sam2_video_predictor.py`, which allows for independent per-object inference. Specifically, in the new `SAM2VideoPredictor`: * Now we handle the inference of each object independently (as if we are opening a separate session for each object) while sharing their backbone features. * This change allows us to relax the assumption of prompting for multi-object tracking. Previously (due to the batching behavior in inference), if a video frame receives clicks for only a subset of objects, the rest of the (non-prompted) objects are assumed to be non-existent in this frame (i.e., in such frames, the user is telling SAM 2 that the rest of the objects don't appear). 
Now, if a frame receives clicks for only a subset of objects, we do not make any assumptions about the remaining (non-prompted) objects (i.e., now each object is handled independently and is not affected by how other objects are prompted). As a result, we allow adding new objects after tracking starts after this change (which was previously a restriction on usage). * We believe that the new version is a more natural inference behavior and therefore switched to it as the default behavior. The previous implementation of `SAM2VideoPredictor` is backed up to in `sam2/sam2_video_predictor_legacy.py`. All the VOS inference results using `tools/vos_inference.py` should remain the same after this change to the `SAM2VideoPredictor` class. 2024-12-11 15:00:55 -08:00			`vos_optimized=False,`
Fix HF image predictor 2024-08-12 23:41:41 +00:00			`**kwargs,`
Initial commit 2024-07-29 21:54:20 +00:00			`):`
			`hydra_overrides = [`
			`"++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",`
			`]`
SAM 2 Update 12/11/2024 -- full model compilation for a major VOS speedup and a new SAM2VideoPredictor to better handle multi-object tracking (#486) This PR provides new features and updates for SAM 2: - We now support `torch.compile` of the entire SAM 2 model on videos, which can be turned on by setting `vos_optimized=True` in `build_sam2_video_predictor` (it uses the new `SAM2VideoPredictorVOS` predictor class in `sam2/sam2_video_predictor.py`). * Compared to the previous setting (which only compiles the image encoder backbone), the new full model compilation gives a major speedup in inference FPS. * In the VOS prediction script `tools/vos_inference.py`, you can specify this option in `tools/vos_inference.py` via the `--use_vos_optimized_video_predictor` flag. * Note that turning on this flag might introduce a small variance in the predictions due to numerical differences caused by `torch.compile` of the full model. * PyTorch 2.5.1 is the minimum version for full support of this feature. (Earlier PyTorch versions might run into compilation errors in some cases.) Therefore, we have updated the minimum PyTorch version to 2.5.1 accordingly in the installation scripts. - We also update the implementation of the `SAM2VideoPredictor` class for the SAM 2 video prediction in `sam2/sam2_video_predictor.py`, which allows for independent per-object inference. Specifically, in the new `SAM2VideoPredictor`: * Now we handle the inference of each object independently (as if we are opening a separate session for each object) while sharing their backbone features. * This change allows us to relax the assumption of prompting for multi-object tracking. Previously (due to the batching behavior in inference), if a video frame receives clicks for only a subset of objects, the rest of the (non-prompted) objects are assumed to be non-existent in this frame (i.e., in such frames, the user is telling SAM 2 that the rest of the objects don't appear). 
Now, if a frame receives clicks for only a subset of objects, we do not make any assumptions about the remaining (non-prompted) objects (i.e., now each object is handled independently and is not affected by how other objects are prompted). As a result, we allow adding new objects after tracking starts after this change (which was previously a restriction on usage). * We believe that the new version is a more natural inference behavior and therefore switched to it as the default behavior. The previous implementation of `SAM2VideoPredictor` is backed up to in `sam2/sam2_video_predictor_legacy.py`. All the VOS inference results using `tools/vos_inference.py` should remain the same after this change to the `SAM2VideoPredictor` class. 2024-12-11 15:00:55 -08:00			`if vos_optimized:`
			`hydra_overrides = [`
			`"++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictorVOS",`
			`"++model.compile_image_encoder=True", # Let sam2_base handle this`
			`]`

Initial commit 2024-07-29 21:54:20 +00:00			`if apply_postprocessing:`
			`hydra_overrides_extra = hydra_overrides_extra.copy()`
			`hydra_overrides_extra += [`
			`# dynamically fall back to multi-mask if the single mask is not stable`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",`
			`"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",`
			`# the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking`
			`"++model.binarize_mask_from_pts_for_mem_enc=true",`
			# fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
			`"++model.fill_hole_area=8",`
			`]`
			`hydra_overrides.extend(hydra_overrides_extra)`

			`# Read config and init model`
			`cfg = compose(config_name=config_file, overrides=hydra_overrides)`
			`OmegaConf.resolve(cfg)`
			`model = instantiate(cfg.model, _recursive_=True)`
			`_load_checkpoint(model, ckpt_path)`
			`model = model.to(device)`
			`if mode == "eval":`
			`model.eval()`
			`return model`


SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`def _hf_download(model_id):`
Make huggingface_hub soft dependency 2024-08-05 09:37:53 +02:00			`from huggingface_hub import hf_hub_download`

SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]`
Add model_id_to_filenames 2024-08-03 14:18:23 +02:00			`ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)`
SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`return config_name, ckpt_path`
First draft 2024-08-03 12:57:05 +02:00

SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`def build_sam2_hf(model_id, **kwargs):`
			`config_name, ckpt_path = _hf_download(model_id)`
			`return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)`
Make huggingface_hub soft dependency 2024-08-05 09:37:53 +02:00

SAM2.1 SAM2.1 checkpoints + training code + Demo 2024-09-28 08:20:56 -07:00			`def build_sam2_video_predictor_hf(model_id, **kwargs):`
			`config_name, ckpt_path = _hf_download(model_id)`
Format using ufmt 2024-08-06 22:43:35 +02:00			`return build_sam2_video_predictor(`
			`config_file=config_name, ckpt_path=ckpt_path, **kwargs`
			`)`
First draft 2024-08-03 12:57:05 +02:00

Initial commit 2024-07-29 21:54:20 +00:00			`def _load_checkpoint(model, ckpt_path):`
			`if ckpt_path is not None:`
Use `weights_only` for loading sam2/build_sam.py:81:14: TOR102 [*] `torch.load` without `weights_only` parameter is unsafe. Explicitly set `weights_only` to False only if you trust the data you load and full pickle functionality is needed, otherwise set `weights_only=True`. Found with https://github.com/pytorch-labs/torchfix/ 2024-07-29 16:54:54 -07:00			`sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]`
Initial commit 2024-07-29 21:54:20 +00:00			`missing_keys, unexpected_keys = model.load_state_dict(sd)`
			`if missing_keys:`
			`logging.error(missing_keys)`
			`raise RuntimeError()`
			`if unexpected_keys:`
			`logging.error(unexpected_keys)`
			`raise RuntimeError()`
Format using ufmt 2024-08-06 22:43:35 +02:00			`logging.info("Loaded checkpoint sucessfully")`