From 34b17b02802b180da7c71975d5f3da9a6d5fdf4b Mon Sep 17 00:00:00 2001
From: kiennt <trung-kien.nguyen@rizlum.ai>
Date: Thu, 14 Aug 2025 09:26:37 +0000
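Subject: [PATCH] feat: update deprecated timm/torch API calls, set eval mode in predict

- Import DropPath, to_2tuple and trunc_normal_ from timm.layers instead of
  timm.models.layers.
- Pass indexing="ij" explicitly to every torch.meshgrid call.
- Replace torch.cuda.amp.autocast(...) with torch.amp.autocast("cuda", ...).
- Call model.eval() in predict() before running inference.
- Strip trailing whitespace in util/inference.py.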

---
 .../models/GroundingDINO/backbone/swin_transformer.py        | 4 ++--
 .../groundingdino/models/GroundingDINO/fuse_modules.py       | 2 +-
 .../groundingdino/models/GroundingDINO/transformer.py        | 3 ++-
 grounding_dino/groundingdino/models/GroundingDINO/utils.py   | 1 +
 grounding_dino/groundingdino/util/box_ops.py                 | 2 +-
 grounding_dino/groundingdino/util/inference.py               | 5 +++--
 training/trainer.py                                          | 5 +++--
 7 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py b/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
index 1c02ac2..eded3ba 100644
--- a/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
@@ -16,7 +16,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from timm.layers import DropPath, to_2tuple, trunc_normal_
 
 from grounding_dino.groundingdino.util.misc import NestedTensor
 
@@ -113,7 +113,7 @@ class WindowAttention(nn.Module):
         # get pair-wise relative position index for each token inside the window
         coords_h = torch.arange(self.window_size[0])
         coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
         coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
         relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
         relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
diff --git a/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py b/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py
index 2753b3d..a5d428c 100644
--- a/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py
@@ -8,7 +8,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from timm.models.layers import DropPath
+from timm.layers import DropPath
 
 
 class FeatureResizer(nn.Module):
diff --git a/grounding_dino/groundingdino/models/GroundingDINO/transformer.py b/grounding_dino/groundingdino/models/GroundingDINO/transformer.py
index 265d3f4..c9cf275 100644
--- a/grounding_dino/groundingdino/models/GroundingDINO/transformer.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/transformer.py
@@ -470,6 +470,7 @@ class TransformerEncoder(nn.Module):
             ref_y, ref_x = torch.meshgrid(
                 torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
                 torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),
+                indexing="ij"
             )
             ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
             ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
@@ -859,7 +860,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_ffn(self, tgt):
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast("cuda", enabled=False):
             tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
         tgt = tgt + self.dropout4(tgt2)
         tgt = self.norm3(tgt)
diff --git a/grounding_dino/groundingdino/models/GroundingDINO/utils.py b/grounding_dino/groundingdino/models/GroundingDINO/utils.py
index 5bd18f7..4634c70 100644
--- a/grounding_dino/groundingdino/models/GroundingDINO/utils.py
+++ b/grounding_dino/groundingdino/models/GroundingDINO/utils.py
@@ -79,6 +79,7 @@ def gen_encoder_output_proposals(
         grid_y, grid_x = torch.meshgrid(
             torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
             torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
+            indexing="ij"
         )
         grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2
 
diff --git a/grounding_dino/groundingdino/util/box_ops.py b/grounding_dino/groundingdino/util/box_ops.py
index 781068d..6159e75 100644
--- a/grounding_dino/groundingdino/util/box_ops.py
+++ b/grounding_dino/groundingdino/util/box_ops.py
@@ -118,7 +118,7 @@ def masks_to_boxes(masks):
 
     y = torch.arange(0, h, dtype=torch.float)
     x = torch.arange(0, w, dtype=torch.float)
-    y, x = torch.meshgrid(y, x)
+    y, x = torch.meshgrid(y, x, indexing="ij")
 
     x_mask = masks * x.unsqueeze(0)
     x_max = x_mask.flatten(1).max(-1)[0]
diff --git a/grounding_dino/groundingdino/util/inference.py b/grounding_dino/groundingdino/util/inference.py
index d6f3b97..7987867 100644
--- a/grounding_dino/groundingdino/util/inference.py
+++ b/grounding_dino/groundingdino/util/inference.py
@@ -63,6 +63,7 @@ def predict(
 
     model = model.to(device)
     image = image.to(device)
+    model.eval()
 
     with torch.no_grad():
         outputs = model(image[None], captions=[caption])
@@ -76,10 +77,10 @@ def predict(
 
     tokenizer = model.tokenizer
     tokenized = tokenizer(caption)
-    
+
     if remove_combined:
         sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
-        
+
         phrases = []
         for logit in logits:
             max_idx = logit.argmax()
diff --git a/training/trainer.py b/training/trainer.py
index 2b7c27b..9d49c2e 100644
--- a/training/trainer.py
+++ b/training/trainer.py
@@ -623,7 +623,7 @@ class Trainer:
 
             # compute output
             with torch.no_grad():
-                with torch.cuda.amp.autocast(
+                with torch.amp.autocast("cuda",
                     enabled=(self.optim_conf.amp.enabled if self.optim_conf else False),
                     dtype=(
                         get_amp_type(self.optim_conf.amp.amp_dtype)
@@ -858,7 +858,8 @@ class Trainer:
         # grads will also update a model even if the step doesn't produce
         # gradients
         self.optim.zero_grad(set_to_none=True)
-        with torch.cuda.amp.autocast(
+        with torch.amp.autocast(
+            "cuda",
             enabled=self.optim_conf.amp.enabled,
             dtype=get_amp_type(self.optim_conf.amp.amp_dtype),
         ):