feat: Update setup for project

feat : Update code, new args
2025-08-14 09:27:02 +00:00 · 2025-08-14 09:26:37 +00:00
10 changed files with 4470 additions and 14 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -144,4 +144,6 @@ dmypy.json
 *.pth
 outputs/

-.idea/
+.idea/
+tmp/
+data/
--- a/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
@@ -16,7 +16,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from timm.layers import DropPath, to_2tuple, trunc_normal_

 from grounding_dino.groundingdino.util.misc import NestedTensor

@@ -113,7 +113,7 @@ class WindowAttention(nn.Module):
        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
--- a/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py
@@ -8,7 +8,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from timm.models.layers import DropPath
+from timm.layers import DropPath


 class FeatureResizer(nn.Module):
--- a/grounding_dino/groundingdino/models/GroundingDINO/transformer.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/transformer.py
@@ -470,6 +470,7 @@ class TransformerEncoder(nn.Module):
            ref_y, ref_x = torch.meshgrid(
                torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
                torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),
+                indexing="ij"
            )
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
@@ -859,7 +860,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast("cuda", enabled=False):
            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
--- a/grounding_dino/groundingdino/models/GroundingDINO/utils.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/utils.py
@@ -79,6 +79,7 @@ def gen_encoder_output_proposals(
        grid_y, grid_x = torch.meshgrid(
            torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
+            indexing="ij"
        )
        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2

--- a/grounding_dino/groundingdino/util/box_ops.py
+++ b/grounding_dino/groundingdino/util/box_ops.py
@@ -118,7 +118,7 @@ def masks_to_boxes(masks):

    y = torch.arange(0, h, dtype=torch.float)
    x = torch.arange(0, w, dtype=torch.float)
-    y, x = torch.meshgrid(y, x)
+    y, x = torch.meshgrid(y, x, indexing="ij")

    x_mask = masks * x.unsqueeze(0)
    x_max = x_mask.flatten(1).max(-1)[0]
--- a/grounding_dino/groundingdino/util/inference.py
+++ b/grounding_dino/groundingdino/util/inference.py
@@ -63,6 +63,7 @@ def predict(

    model = model.to(device)
    image = image.to(device)
+    model.eval()

    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
@@ -76,10 +77,10 @@ def predict(

    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
-    
+
    if remove_combined:
        sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
-        
+
        phrases = []
        for logit in logits:
            max_idx = logit.argmax()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,68 @@
 [build-system]
-requires = [
-    "setuptools>=62.3.0,<75.9",
-    "torch>=2.3.1",
-    ]
+requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
+
+[project]
+name = "Grounded-SAM-2"
+version = "1.0"
+description = "Grounded SAM 2: Ground and Track Anything in Videos"
+readme = "README.md"
+requires-python = ">=3.10.0"
+license = { text = "Apache 2.0" }
+authors = [{ name = "Meta AI", email = "segment-anything@meta.com" }]
+keywords = ["segmentation", "computer vision", "deep learning"]
+
+dependencies = [
+    "torch>=2.3.1",
+    "torchvision>=0.18.1",
+    "numpy>=1.24.4",
+    "tqdm>=4.66.1",
+    "hydra-core>=1.3.2",
+    "iopath>=0.1.10",
+    "pillow>=9.4.0",
+    "opencv-python-headless>=4.11.0.86",
+    "supervision>=0.26.1",
+    "pycocotools>=2.0.10",
+    "transformers>=4.55.1",
+    "addict>=2.4.0",
+    "yapf>=0.43.0",
+    "timm>=1.0.19",
+    "pdf2image>=1.17.0",
+]
+
+[project.optional-dependencies]
+notebooks = [
+    "matplotlib>=3.9.1",
+    "jupyter>=1.0.0",
+    "opencv-python>=4.7.0",
+    "eva-decord>=0.6.1",
+]
+interactive-demo = [
+    "Flask>=3.0.3",
+    "Flask-Cors>=5.0.0",
+    "av>=13.0.0",
+    "dataclasses-json>=0.6.7",
+    "eva-decord>=0.6.1",
+    "gunicorn>=23.0.0",
+    "imagesize>=1.4.1",
+    "pycocotools>=2.0.8",
+    "strawberry-graphql>=0.243.0",
+]
+dev = [
+    "black==24.2.0",
+    "usort==1.0.2",
+    "ufmt==2.0.0b2",
+    "fvcore>=0.1.5.post20221221",
+    "pandas>=2.2.2",
+    "scikit-image>=0.24.0",
+    "tensorboard>=2.17.0",
+    "pycocotools>=2.0.8",
+    "tensordict>=0.5.0",
+    "opencv-python>=4.7.0",
+    "submitit>=1.5.1",
+]
+
+
+[tool.setuptools]
+# extensions = [{ name = "sam2._C", sources = ["sam2/csrc/connected_components.cu"] }]
+packages = { find = { include = ["sam2*", "grounding_dino*"] } }  # discover subpackages too; an explicit list ships only the names given
--- a/training/trainer.py
+++ b/training/trainer.py
@@ -623,7 +623,7 @@ class Trainer:

            # compute output
            with torch.no_grad():
-                with torch.cuda.amp.autocast(
+                with torch.amp.autocast("cuda",
                    enabled=(self.optim_conf.amp.enabled if self.optim_conf else False),
                    dtype=(
                        get_amp_type(self.optim_conf.amp.amp_dtype)
@@ -858,7 +858,8 @@ class Trainer:
        # grads will also update a model even if the step doesn't produce
        # gradients
        self.optim.zero_grad(set_to_none=True)
-        with torch.cuda.amp.autocast(
+        with torch.amp.autocast(
+            "cuda",
            enabled=self.optim_conf.amp.enabled,
            dtype=get_amp_type(self.optim_conf.amp.amp_dtype),
        ):
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
kiennt	33303aa62f	feat: Update setup for project	2025-08-14 09:27:02 +00:00
kiennt	34b17b0280	feat : Update code, new args	2025-08-14 09:26:37 +00:00