open README.md with unicode (to support Hugging Face emoji); fix various typos (#218)
(close #217, #66, #67, #69, #91, #126, #127, #145)
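A minimal sketch of the failure the title refers to (standalone Python, not code from this repo; "ascii" stands in for whichever non-UTF-8 locale default a platform may use):

# Write a README containing the Hugging Face emoji (a 4-byte UTF-8 character).
with open("README.md", "w", encoding="utf-8") as f:
    f.write("SAM 2 on 🤗 Hugging Face\n")

# Reading it back without an explicit encoding uses the platform default,
# which may not be able to decode those bytes.
try:
    with open("README.md", "r", encoding="ascii") as f:  # simulated bad default
        f.read()
except UnicodeDecodeError as e:
    print("read fails without encoding='utf-8':", e)

# The fix applied in setup.py below: request UTF-8 explicitly.
with open("README.md", "r", encoding="utf-8") as f:
    print(f.read())
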
@@ -16,7 +16,7 @@ from torch import nn
 class PositionEmbeddingSine(nn.Module):
     """
     This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
+    used by the Attention Is All You Need paper, generalized to work on images.
     """
 
     def __init__(

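For reference, a self-contained sketch of the 1-D sinusoidal embedding from the Attention Is All You Need paper that this docstring cites (illustrative only; SAM 2's PositionEmbeddingSine generalizes the idea to 2-D feature maps):

import math
import torch

def sine_pos_embed_1d(num_pos: int, dim: int) -> torch.Tensor:
    # pe[p, 2i] = sin(p / 10000^(2i/dim)), pe[p, 2i+1] = cos(p / 10000^(2i/dim))
    pos = torch.arange(num_pos, dtype=torch.float32).unsqueeze(1)   # (L, 1)
    i = torch.arange(0, dim, 2, dtype=torch.float32)                # (dim/2,)
    div = torch.exp(-math.log(10000.0) * i / dim)
    pe = torch.zeros(num_pos, dim)
    pe[:, 0::2] = torch.sin(pos * div)
    pe[:, 1::2] = torch.cos(pos * div)
    return pe

print(sine_pos_embed_1d(4, 8).shape)  # torch.Size([4, 8])
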
@@ -642,7 +642,7 @@ class SAM2Base(torch.nn.Module):
                 pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
                 return pix_feat_with_mem
 
-            # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder)
+            # Use a dummy token on the first frame (to avoid empty memory input to tranformer encoder)
             to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
             to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
 

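The dummy-token lines above broadcast a single learned embedding across the batch so the memory input is never empty; a toy illustration (sizes are assumptions inferred from the expand() call, not taken from the model config):

import torch

B, mem_dim = 4, 64                                    # assumed sizes
no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, mem_dim))

# expand() returns a broadcast view (no copy): one placeholder token per batch item
to_cat_memory = [no_mem_embed.expand(1, B, mem_dim)]
memory = torch.cat(to_cat_memory, dim=0)
print(memory.shape)  # torch.Size([1, 4, 64]); the encoder never sees empty memory
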
@@ -183,7 +183,7 @@ class SAM2ImagePredictor:
         normalize_coords=True,
     ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
         """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
-        It returns a tupele of lists of masks, ious, and low_res_masks_logits.
+        It returns a tuple of lists of masks, ious, and low_res_masks_logits.
         """
         assert self._is_batch, "This function should only be used when in batched mode"
         if not self._is_image_set:

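A hedged usage sketch of the batched API this docstring describes (config/checkpoint paths and prompts are hypothetical; the parameter names follow the repository's set_image_batch/predict_batch, so double-check against the actual signature):

import numpy as np
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

predictor = SAM2ImagePredictor(build_sam2("sam2_hiera_l.yaml", "./checkpoints/sam2_hiera_large.pt"))

images = [np.zeros((480, 640, 3), dtype=np.uint8), np.zeros((360, 640, 3), dtype=np.uint8)]
predictor.set_image_batch(images)
masks_list, ious_list, low_res_list = predictor.predict_batch(
    point_coords_batch=[np.array([[320, 240]]), np.array([[100, 100]])],
    point_labels_batch=[np.array([1]), np.array([1])],  # 1 = foreground click
)
assert len(masks_list) == len(images)  # a tuple of lists, one entry per image
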
@@ -44,7 +44,7 @@ class SAM2VideoPredictor(SAM2Base):
         offload_state_to_cpu=False,
         async_loading_frames=False,
     ):
-        """Initialize a inference state."""
+        """Initialize an inference state."""
         compute_device = self.device  # device of the model
         images, video_height, video_width = load_video_frames(
             video_path=video_path,

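A hedged sketch of how this initializer is typically called (paths are hypothetical; the keyword names come from the hunk itself):

from sam2.build_sam import build_sam2_video_predictor

predictor = build_sam2_video_predictor("sam2_hiera_l.yaml", "./checkpoints/sam2_hiera_large.pt")
inference_state = predictor.init_state(
    video_path="./videos/example",     # e.g. a directory of JPEG frames
    offload_video_to_cpu=False,        # keep loaded frames on the compute device
    offload_state_to_cpu=False,
    async_loading_frames=False,        # see AsyncVideoFrameLoader below
)
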
@@ -589,7 +589,7 @@ class SAM2VideoPredictor(SAM2Base):
         # to `propagate_in_video_preflight`).
         consolidated_frame_inds = inference_state["consolidated_frame_inds"]
         for is_cond in [False, True]:
-            # Separately consolidate conditioning and non-conditioning temp outptus
+            # Separately consolidate conditioning and non-conditioning temp outputs
             storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
             # Find all the frames that contain temporary outputs for any objects
             # (these should be the frames that have just received clicks for mask inputs

@@ -598,7 +598,7 @@ class SAM2VideoPredictor(SAM2Base):
             for obj_temp_output_dict in temp_output_dict_per_obj.values():
                 temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
             consolidated_frame_inds[storage_key].update(temp_frame_inds)
-            # consolidate the temprary output across all objects on this frame
+            # consolidate the temporary output across all objects on this frame
             for frame_idx in temp_frame_inds:
                 consolidated_out = self._consolidate_temp_output_across_obj(
                     inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True

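The consolidation pattern above is a plain set union over per-object dictionaries; a toy run with fabricated data:

temp_output_dict_per_obj = {
    0: {"cond_frame_outputs": {3: "out_a"}, "non_cond_frame_outputs": {}},
    1: {"cond_frame_outputs": {3: "out_b", 7: "out_c"}, "non_cond_frame_outputs": {}},
}
storage_key = "cond_frame_outputs"
temp_frame_inds = set()
for obj_temp_output_dict in temp_output_dict_per_obj.values():
    temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
print(sorted(temp_frame_inds))  # [3, 7]: frames needing consolidation across objects
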
@@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor):
     compute bounding box given an input mask
 
     Inputs:
-    - masks: [B, 1, H, W] boxes, dtype=torch.Tensor
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
 
     Returns:
     - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor

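A minimal reimplementation matching the documented shapes ([B, 1, H, W] masks in, [B, 1, 4] boxes as (x0, y0, x1, y1) out); an assumed sketch, not the repository's mask_to_box:

import torch

def mask_to_box_sketch(masks: torch.Tensor) -> torch.Tensor:
    B, _, H, W = masks.shape
    grid_x = torch.arange(W).view(1, 1, 1, W).expand(B, 1, H, W)
    grid_y = torch.arange(H).view(1, 1, H, 1).expand(B, 1, H, W)
    # min/max over pixel coordinates where the mask is True
    x0 = torch.where(masks, grid_x, torch.full_like(grid_x, W)).flatten(2).min(-1).values
    y0 = torch.where(masks, grid_y, torch.full_like(grid_y, H)).flatten(2).min(-1).values
    x1 = torch.where(masks, grid_x, torch.full_like(grid_x, -1)).flatten(2).max(-1).values
    y1 = torch.where(masks, grid_y, torch.full_like(grid_y, -1)).flatten(2).max(-1).values
    return torch.stack([x0, y0, x1, y1], dim=-1)  # [B, 1, 4]

m = torch.zeros(1, 1, 8, 8, dtype=torch.bool)
m[0, 0, 2:5, 3:7] = True          # rows 2..4, cols 3..6
print(mask_to_box_sketch(m))      # tensor([[[3, 2, 6, 4]]])
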
@@ -120,7 +120,7 @@ class AsyncVideoFrameLoader:
         self.offload_video_to_cpu = offload_video_to_cpu
         self.img_mean = img_mean
         self.img_std = img_std
-        # items in `self._images` will be loaded asynchronously
+        # items in `self.images` will be loaded asynchronously
         self.images = [None] * len(img_paths)
         # catch and raise any exceptions in the async loading thread
         self.exception = None

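The two comments above describe a common pattern: a worker thread fills a list lazily and stores any exception for the consumer to re-raise. A standalone toy version (not the repository class):

import threading

class AsyncListLoader:
    def __init__(self, paths):
        self.images = [None] * len(paths)   # items filled asynchronously
        self.exception = None               # captured from the worker thread
        self.thread = threading.Thread(target=self._load, args=(paths,), daemon=True)
        self.thread.start()

    def _load(self, paths):
        try:
            for i, p in enumerate(paths):
                self.images[i] = f"decoded({p})"  # stand-in for real frame decoding
        except Exception as e:
            self.exception = e

    def __getitem__(self, i):
        if self.exception is not None:
            raise RuntimeError("async frame loading failed") from self.exception
        return self.images[i]

loader = AsyncListLoader(["00000.jpg", "00001.jpg"])
loader.thread.join()
print(loader[1])  # decoded(00001.jpg)
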
@@ -72,7 +72,7 @@ parser.add_argument(
 parser.add_argument(
     "--do_not_skip_first_and_last_frame",
     help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
-    "Set this to true for evaluation on settings that doen't skip first and last frames",
+    "Set this to true for evaluation on settings that doesn't skip first and last frames",
     action="store_true",
 )
 

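For readers unfamiliar with store_true flags: the option above defaults to False and flips to True when supplied, which is what the help text's "set this to true" means. A standalone demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--do_not_skip_first_and_last_frame", action="store_true")

print(parser.parse_args([]).do_not_skip_first_and_last_frame)   # False (flag absent)
args = parser.parse_args(["--do_not_skip_first_and_last_frame"])
print(args.do_not_skip_first_and_last_frame)                    # True (flag supplied)
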
@@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None):
 
     assert not (
         width > w | height > h | abs(ar1 - ar2) > 0.01
-    ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+    ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
 
     e = np.zeros_like(seg)
     s = np.zeros_like(seg)

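A side note on the untouched condition in this assert: in Python, `|` binds tighter than comparisons, so `width > w | height > h` parses as the chained comparison `width > (w | height) > h`, not `(width > w) | (height > h)`. A small demonstration with concrete numbers:

width, w, height, h = 10, 4, 3, 8
print(width > w | height > h)       # 10 > (4 | 3) > 8  ->  10 > 7 and 7 > 8  ->  False
print((width > w) | (height > h))   # True | False  ->  True
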
setup.py
@@ -17,7 +17,7 @@ AUTHOR_EMAIL = "segment-anything@meta.com"
 LICENSE = "Apache 2.0"
 
 # Read the contents of README file
-with open("README.md", "r") as f:
+with open("README.md", "r", encoding="utf-8") as f:
     LONG_DESCRIPTION = f.read()
 
 # Required dependencies