open README.md with unicode (to support Hugging Face emoji); fix various typos (#218)

(close #217, #66, #67, #69, #91, #126, #127, #145)
This commit is contained in:
Ronghang Hu
2024-08-14 09:06:25 -07:00
committed by GitHub
parent 0db838b117
commit 7e1596c0b6
8 changed files with 11 additions and 11 deletions

View File

@@ -16,7 +16,7 @@ from torch import nn
class PositionEmbeddingSine(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one
used by the Attention is all you need paper, generalized to work on images.
used by the Attention Is All You Need paper, generalized to work on images.
"""
def __init__(

View File

@@ -642,7 +642,7 @@ class SAM2Base(torch.nn.Module):
pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
return pix_feat_with_mem
# Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder)
# Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

View File

@@ -183,7 +183,7 @@ class SAM2ImagePredictor:
normalize_coords=True,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
"""This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
It returns a tupele of lists of masks, ious, and low_res_masks_logits.
It returns a tuple of lists of masks, ious, and low_res_masks_logits.
"""
assert self._is_batch, "This function should only be used when in batched mode"
if not self._is_image_set:

View File

@@ -44,7 +44,7 @@ class SAM2VideoPredictor(SAM2Base):
offload_state_to_cpu=False,
async_loading_frames=False,
):
"""Initialize a inference state."""
"""Initialize an inference state."""
compute_device = self.device # device of the model
images, video_height, video_width = load_video_frames(
video_path=video_path,
@@ -589,7 +589,7 @@ class SAM2VideoPredictor(SAM2Base):
# to `propagate_in_video_preflight`).
consolidated_frame_inds = inference_state["consolidated_frame_inds"]
for is_cond in [False, True]:
# Separately consolidate conditioning and non-conditioning temp outptus
# Separately consolidate conditioning and non-conditioning temp outputs
storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
# Find all the frames that contain temporary outputs for any objects
# (these should be the frames that have just received clicks for mask inputs
@@ -598,7 +598,7 @@ class SAM2VideoPredictor(SAM2Base):
for obj_temp_output_dict in temp_output_dict_per_obj.values():
temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
consolidated_frame_inds[storage_key].update(temp_frame_inds)
# consolidate the temprary output across all objects on this frame
# consolidate the temporary output across all objects on this frame
for frame_idx in temp_frame_inds:
consolidated_out = self._consolidate_temp_output_across_obj(
inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True

View File

@@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor):
compute bounding box given an input mask
Inputs:
- masks: [B, 1, H, W] boxes, dtype=torch.Tensor
- masks: [B, 1, H, W] masks, dtype=torch.Tensor
Returns:
- box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
@@ -120,7 +120,7 @@ class AsyncVideoFrameLoader:
self.offload_video_to_cpu = offload_video_to_cpu
self.img_mean = img_mean
self.img_std = img_std
# items in `self._images` will be loaded asynchronously
# items in `self.images` will be loaded asynchronously
self.images = [None] * len(img_paths)
# catch and raise any exceptions in the async loading thread
self.exception = None

View File

@@ -72,7 +72,7 @@ parser.add_argument(
parser.add_argument(
"--do_not_skip_first_and_last_frame",
help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
"Set this to true for evaluation on settings that doen't skip first and last frames",
"Set this to true for evaluation on settings that doesn't skip first and last frames",
action="store_true",
)

View File

@@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None):
assert not (
width > w | height > h | abs(ar1 - ar2) > 0.01
), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
e = np.zeros_like(seg)
s = np.zeros_like(seg)

View File

@@ -17,7 +17,7 @@ AUTHOR_EMAIL = "segment-anything@meta.com"
LICENSE = "Apache 2.0"
# Read the contents of README file
with open("README.md", "r") as f:
with open("README.md", "r", encoding="utf-8") as f:
LONG_DESCRIPTION = f.read()
# Required dependencies