Compare commits
2 Commits
2111d9c52c
...
main
Author | SHA1 | Date | |
---|---|---|---|
![]() |
33303aa62f | ||
![]() |
34b17b0280 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -144,4 +144,6 @@ dmypy.json
|
|||||||
*.pth
|
*.pth
|
||||||
outputs/
|
outputs/
|
||||||
|
|
||||||
.idea/
|
.idea/
|
||||||
|
tmp/
|
||||||
|
data/
|
||||||
|
@@ -16,7 +16,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import torch.utils.checkpoint as checkpoint
|
import torch.utils.checkpoint as checkpoint
|
||||||
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
from timm.layers import DropPath, to_2tuple, trunc_normal_
|
||||||
|
|
||||||
from grounding_dino.groundingdino.util.misc import NestedTensor
|
from grounding_dino.groundingdino.util.misc import NestedTensor
|
||||||
|
|
||||||
@@ -113,7 +113,7 @@ class WindowAttention(nn.Module):
|
|||||||
# get pair-wise relative position index for each token inside the window
|
# get pair-wise relative position index for each token inside the window
|
||||||
coords_h = torch.arange(self.window_size[0])
|
coords_h = torch.arange(self.window_size[0])
|
||||||
coords_w = torch.arange(self.window_size[1])
|
coords_w = torch.arange(self.window_size[1])
|
||||||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij")) # 2, Wh, Ww
|
||||||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
||||||
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
||||||
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from timm.models.layers import DropPath
|
from timm.layers import DropPath
|
||||||
|
|
||||||
|
|
||||||
class FeatureResizer(nn.Module):
|
class FeatureResizer(nn.Module):
|
||||||
|
@@ -470,6 +470,7 @@ class TransformerEncoder(nn.Module):
|
|||||||
ref_y, ref_x = torch.meshgrid(
|
ref_y, ref_x = torch.meshgrid(
|
||||||
torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
|
torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
|
||||||
torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),
|
torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),
|
||||||
|
indexing="ij"
|
||||||
)
|
)
|
||||||
ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
|
ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
|
||||||
ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
|
ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
|
||||||
@@ -859,7 +860,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
|
|||||||
return tensor if pos is None else tensor + pos
|
return tensor if pos is None else tensor + pos
|
||||||
|
|
||||||
def forward_ffn(self, tgt):
|
def forward_ffn(self, tgt):
|
||||||
with torch.cuda.amp.autocast(enabled=False):
|
with torch.amp.autocast("cuda", enabled=False):
|
||||||
tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
|
tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
|
||||||
tgt = tgt + self.dropout4(tgt2)
|
tgt = tgt + self.dropout4(tgt2)
|
||||||
tgt = self.norm3(tgt)
|
tgt = self.norm3(tgt)
|
||||||
|
@@ -79,6 +79,7 @@ def gen_encoder_output_proposals(
|
|||||||
grid_y, grid_x = torch.meshgrid(
|
grid_y, grid_x = torch.meshgrid(
|
||||||
torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
|
torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
|
||||||
torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
|
torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
|
||||||
|
indexing="ij"
|
||||||
)
|
)
|
||||||
grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2
|
grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2
|
||||||
|
|
||||||
|
@@ -118,7 +118,7 @@ def masks_to_boxes(masks):
|
|||||||
|
|
||||||
y = torch.arange(0, h, dtype=torch.float)
|
y = torch.arange(0, h, dtype=torch.float)
|
||||||
x = torch.arange(0, w, dtype=torch.float)
|
x = torch.arange(0, w, dtype=torch.float)
|
||||||
y, x = torch.meshgrid(y, x)
|
y, x = torch.meshgrid(y, x, indexing="ij")
|
||||||
|
|
||||||
x_mask = masks * x.unsqueeze(0)
|
x_mask = masks * x.unsqueeze(0)
|
||||||
x_max = x_mask.flatten(1).max(-1)[0]
|
x_max = x_mask.flatten(1).max(-1)[0]
|
||||||
|
@@ -63,6 +63,7 @@ def predict(
|
|||||||
|
|
||||||
model = model.to(device)
|
model = model.to(device)
|
||||||
image = image.to(device)
|
image = image.to(device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(image[None], captions=[caption])
|
outputs = model(image[None], captions=[caption])
|
||||||
@@ -76,10 +77,10 @@ def predict(
|
|||||||
|
|
||||||
tokenizer = model.tokenizer
|
tokenizer = model.tokenizer
|
||||||
tokenized = tokenizer(caption)
|
tokenized = tokenizer(caption)
|
||||||
|
|
||||||
if remove_combined:
|
if remove_combined:
|
||||||
sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
|
sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
|
||||||
|
|
||||||
phrases = []
|
phrases = []
|
||||||
for logit in logits:
|
for logit in logits:
|
||||||
max_idx = logit.argmax()
|
max_idx = logit.argmax()
|
||||||
|
@@ -1,6 +1,68 @@
|
|||||||
[build-system]
|
[build-system]
|
||||||
requires = [
|
requires = ["setuptools>=61.0", "wheel"]
|
||||||
"setuptools>=62.3.0,<75.9",
|
|
||||||
"torch>=2.3.1",
|
|
||||||
]
|
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "Grounded-SAM-2"
|
||||||
|
version = "1.0"
|
||||||
|
description = "Grounded SAM 2: Ground and Track Anything in Videos"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.10.0"
|
||||||
|
license = { text = "Apache 2.0" }
|
||||||
|
authors = [{ name = "Meta AI", email = "segment-anything@meta.com" }]
|
||||||
|
keywords = ["segmentation", "computer vision", "deep learning"]
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
"torch>=2.3.1",
|
||||||
|
"torchvision>=0.18.1",
|
||||||
|
"numpy>=1.24.4",
|
||||||
|
"tqdm>=4.66.1",
|
||||||
|
"hydra-core>=1.3.2",
|
||||||
|
"iopath>=0.1.10",
|
||||||
|
"pillow>=9.4.0",
|
||||||
|
"opencv-python-headless>=4.11.0.86",
|
||||||
|
"supervision>=0.26.1",
|
||||||
|
"pycocotools>=2.0.10",
|
||||||
|
"transformers>=4.55.1",
|
||||||
|
"addict>=2.4.0",
|
||||||
|
"yapf>=0.43.0",
|
||||||
|
"timm>=1.0.19",
|
||||||
|
"pdf2image>=1.17.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
notebooks = [
|
||||||
|
"matplotlib>=3.9.1",
|
||||||
|
"jupyter>=1.0.0",
|
||||||
|
"opencv-python>=4.7.0",
|
||||||
|
"eva-decord>=0.6.1",
|
||||||
|
]
|
||||||
|
interactive-demo = [
|
||||||
|
"Flask>=3.0.3",
|
||||||
|
"Flask-Cors>=5.0.0",
|
||||||
|
"av>=13.0.0",
|
||||||
|
"dataclasses-json>=0.6.7",
|
||||||
|
"eva-decord>=0.6.1",
|
||||||
|
"gunicorn>=23.0.0",
|
||||||
|
"imagesize>=1.4.1",
|
||||||
|
"pycocotools>=2.0.8",
|
||||||
|
"strawberry-graphql>=0.243.0",
|
||||||
|
]
|
||||||
|
dev = [
|
||||||
|
"black==24.2.0",
|
||||||
|
"usort==1.0.2",
|
||||||
|
"ufmt==2.0.0b2",
|
||||||
|
"fvcore>=0.1.5.post20221221",
|
||||||
|
"pandas>=2.2.2",
|
||||||
|
"scikit-image>=0.24.0",
|
||||||
|
"tensorboard>=2.17.0",
|
||||||
|
"pycocotools>=2.0.8",
|
||||||
|
"tensordict>=0.5.0",
|
||||||
|
"opencv-python>=4.7.0",
|
||||||
|
"submitit>=1.5.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
# extensions = [{ name = "sam2._C", sources = ["sam2/csrc/connected_components.cu"] }]
|
||||||
|
packages = ["sam2", "grounding_dino"]
|
||||||
|
@@ -623,7 +623,7 @@ class Trainer:
|
|||||||
|
|
||||||
# compute output
|
# compute output
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
with torch.cuda.amp.autocast(
|
with torch.amp.autocast("cuda",
|
||||||
enabled=(self.optim_conf.amp.enabled if self.optim_conf else False),
|
enabled=(self.optim_conf.amp.enabled if self.optim_conf else False),
|
||||||
dtype=(
|
dtype=(
|
||||||
get_amp_type(self.optim_conf.amp.amp_dtype)
|
get_amp_type(self.optim_conf.amp.amp_dtype)
|
||||||
@@ -858,7 +858,8 @@ class Trainer:
|
|||||||
# grads will also update a model even if the step doesn't produce
|
# grads will also update a model even if the step doesn't produce
|
||||||
# gradients
|
# gradients
|
||||||
self.optim.zero_grad(set_to_none=True)
|
self.optim.zero_grad(set_to_none=True)
|
||||||
with torch.cuda.amp.autocast(
|
with torch.amp.autocast(
|
||||||
|
"cuda",
|
||||||
enabled=self.optim_conf.amp.enabled,
|
enabled=self.optim_conf.amp.enabled,
|
||||||
dtype=get_amp_type(self.optim_conf.amp.amp_dtype),
|
dtype=get_amp_type(self.optim_conf.amp.amp_dtype),
|
||||||
):
|
):
|
||||||
|
Reference in New Issue
Block a user