From 34b17b02802b180da7c71975d5f3da9a6d5fdf4b Mon Sep 17 00:00:00 2001 From: kiennt Date: Thu, 14 Aug 2025 09:26:37 +0000 Subject: [PATCH] feat : Update code, new args --- .../models/GroundingDINO/backbone/swin_transformer.py | 4 ++-- .../groundingdino/models/GroundingDINO/fuse_modules.py | 2 +- .../groundingdino/models/GroundingDINO/transformer.py | 3 ++- grounding_dino/groundingdino/models/GroundingDINO/utils.py | 1 + grounding_dino/groundingdino/util/box_ops.py | 2 +- grounding_dino/groundingdino/util/inference.py | 5 +++-- training/trainer.py | 5 +++-- 7 files changed, 13 insertions(+), 9 deletions(-) diff --git a/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py b/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py index 1c02ac2..eded3ba 100644 --- a/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py +++ b/grounding_dino/groundingdino/models/GroundingDINO/backbone/swin_transformer.py @@ -16,7 +16,7 @@ import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from timm.layers import DropPath, to_2tuple, trunc_normal_ from grounding_dino.groundingdino.util.misc import NestedTensor @@ -113,7 +113,7 @@ class WindowAttention(nn.Module): # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij")) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 diff --git a/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py b/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py index 2753b3d..a5d428c 100644 --- a/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py +++ b/grounding_dino/groundingdino/models/GroundingDINO/fuse_modules.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from timm.models.layers import DropPath +from timm.layers import DropPath class FeatureResizer(nn.Module): diff --git a/grounding_dino/groundingdino/models/GroundingDINO/transformer.py b/grounding_dino/groundingdino/models/GroundingDINO/transformer.py index 265d3f4..c9cf275 100644 --- a/grounding_dino/groundingdino/models/GroundingDINO/transformer.py +++ b/grounding_dino/groundingdino/models/GroundingDINO/transformer.py @@ -470,6 +470,7 @@ class TransformerEncoder(nn.Module): ref_y, ref_x = torch.meshgrid( torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + indexing="ij" ) ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) @@ -859,7 +860,7 @@ class DeformableTransformerDecoderLayer(nn.Module): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast("cuda", enabled=False): tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) diff --git a/grounding_dino/groundingdino/models/GroundingDINO/utils.py b/grounding_dino/groundingdino/models/GroundingDINO/utils.py index 5bd18f7..4634c70 100644 --- a/grounding_dino/groundingdino/models/GroundingDINO/utils.py +++ b/grounding_dino/groundingdino/models/GroundingDINO/utils.py @@ -79,6 +79,7 @@ def gen_encoder_output_proposals( grid_y, grid_x = torch.meshgrid( torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), + indexing="ij" ) grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 diff --git a/grounding_dino/groundingdino/util/box_ops.py b/grounding_dino/groundingdino/util/box_ops.py index 781068d..6159e75 100644 --- a/grounding_dino/groundingdino/util/box_ops.py +++ b/grounding_dino/groundingdino/util/box_ops.py @@ -118,7 +118,7 @@ def masks_to_boxes(masks): y = torch.arange(0, h, dtype=torch.float) x = torch.arange(0, w, dtype=torch.float) - y, x = torch.meshgrid(y, x) + y, x = torch.meshgrid(y, x, indexing="ij") x_mask = masks * x.unsqueeze(0) x_max = x_mask.flatten(1).max(-1)[0] diff --git a/grounding_dino/groundingdino/util/inference.py b/grounding_dino/groundingdino/util/inference.py index d6f3b97..7987867 100644 --- a/grounding_dino/groundingdino/util/inference.py +++ b/grounding_dino/groundingdino/util/inference.py @@ -63,6 +63,7 @@ def predict( model = model.to(device) image = image.to(device) + model.eval() with torch.no_grad(): outputs = model(image[None], captions=[caption]) @@ -76,10 +77,10 @@ def predict( tokenizer = model.tokenizer tokenized = tokenizer(caption) - + if remove_combined: sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]] - + phrases = [] for logit in logits: max_idx = logit.argmax() diff --git a/training/trainer.py b/training/trainer.py index 2b7c27b..9d49c2e 100644 --- a/training/trainer.py +++ b/training/trainer.py @@ -623,7 +623,7 @@ class Trainer: # compute output with torch.no_grad(): - with torch.cuda.amp.autocast( + with torch.amp.autocast("cuda", enabled=(self.optim_conf.amp.enabled if self.optim_conf else False), dtype=( get_amp_type(self.optim_conf.amp.amp_dtype) @@ -858,7 +858,8 @@ class Trainer: # grads will also update a model even if the step doesn't produce # gradients self.optim.zero_grad(set_to_none=True) - with torch.cuda.amp.autocast( + with torch.amp.autocast( + "cuda", enabled=self.optim_conf.amp.enabled, dtype=get_amp_type(self.optim_conf.amp.amp_dtype), ):