fix: for CUDA version >= 12.6

kiennt
2025-08-16 09:57:17 +00:00
parent e1420f9335
commit 546f444c1c
10 changed files with 1453 additions and 23 deletions

3
.gitignore vendored
View File

@@ -144,3 +144,6 @@ grounding/version.py
vis/
tmp/
data/
*.pth

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Define the URLs for the checkpoints
BASE_URL="https://github.com/IDEA-Research/GroundingDINO/releases/download/"
swint_ogc_url="${BASE_URL}v0.1.0-alpha/groundingdino_swint_ogc.pth"
swinb_cogcoor_url="${BASE_URL}v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth"
# Download the two checkpoints using wget
echo "Downloading groundingdino_swint_ogc.pth checkpoint..."
wget $swint_ogc_url || { echo "Failed to download checkpoint from $swint_ogc_url"; exit 1; }
echo "Downloading groundingdino_swinb_cogcoor.pth checkpoint..."
wget $swinb_cogcoor_url || { echo "Failed to download checkpoint from $swinb_cogcoor_url"; exit 1; }
echo "All checkpoints downloaded successfully."

View File

@@ -16,7 +16,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.layers import DropPath, to_2tuple, trunc_normal_
from groundingdino.util.misc import NestedTensor
@@ -445,7 +445,7 @@ class BasicLayer(nn.Module):
for blk in self.blocks:
blk.H, blk.W = H, W
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x, attn_mask)
x = checkpoint.checkpoint(blk, x, attn_mask, use_reentrant=True)
else:
x = blk(x, attn_mask)
if self.downsample is not None:
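Recent timm releases expose DropPath, to_2tuple and trunc_normal_ from timm.layers, while the old timm.models.layers location is deprecated. If older timm installs still need to be supported, a version-tolerant import is possible; the fallback below is a sketch and not part of this commit:

try:
    from timm.layers import DropPath, to_2tuple, trunc_normal_  # new location (timm >= 0.9)
except ImportError:
    # older timm releases only ship the deprecated timm.models.layers path
    from timm.models.layers import DropPath, to_2tuple, trunc_normal_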

View File

@@ -15,6 +15,19 @@
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include <torch/version.h>
// Check the PyTorch version and pick the matching tensor-introspection API
#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 6
// PyTorch 2.6 and above: use scalar_type()/device() instead of the deprecated .type() API
#define GET_TENSOR_TYPE(x) x.scalar_type()
#define IS_CUDA_TENSOR(x) x.device().is_cuda()
#else
// Older PyTorch: keep the legacy .type()-based API
#define GET_TENSOR_TYPE(x) x.type()
#define IS_CUDA_TENSOR(x) x.type().is_cuda()
#endif
namespace groundingdino {
@@ -32,11 +45,11 @@ at::Tensor ms_deform_attn_cuda_forward(
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(value), "value must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(spatial_shapes), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(level_start_index), "level_start_index must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(sampling_loc), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(attn_weight), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@@ -62,7 +75,7 @@ at::Tensor ms_deform_attn_cuda_forward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES(GET_TENSOR_TYPE(value), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
@@ -98,12 +111,12 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(value), "value must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(spatial_shapes), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(level_start_index), "level_start_index must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(sampling_loc), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(attn_weight), "attn_weight must be a CUDA tensor");
AT_ASSERTM(IS_CUDA_TENSOR(grad_output), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@@ -132,7 +145,7 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES(GET_TENSOR_TYPE(value), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
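The macros above exist because newer LibTorch deprecates the DeprecatedTypeProperties-based tensor.type() accessor, so recent builds go through scalar_type() and device().is_cuda() instead. A quick way to confirm the rebuilt extension actually loads, assuming it compiles into the package's usual groundingdino._C module (a sketch, not part of this commit):

import torch
print(torch.__version__, torch.version.cuda)  # e.g. 2.6.x built against CUDA 12.6
from groundingdino import _C  # an ImportError here means the CUDA extension did not build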

View File

@@ -8,7 +8,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import DropPath
from timm.layers import DropPath
class FeatureResizer(nn.Module):

View File

@@ -554,6 +554,7 @@ class TransformerEncoder(nn.Module):
memory_text,
key_padding_mask,
text_attention_mask,
use_reentrant=True,
)
else:
output, memory_text = self.fusion_layers[layer_id](
@@ -581,6 +582,7 @@ class TransformerEncoder(nn.Module):
spatial_shapes,
level_start_index,
key_padding_mask,
use_reentrant=True,
)
else:
output = layer(
@@ -859,7 +861,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
return tensor if pos is None else tensor + pos
def forward_ffn(self, tgt):
with torch.cuda.amp.autocast(enabled=False):
with torch.amp.autocast("cuda", enabled=False):
tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
tgt = tgt + self.dropout4(tgt2)
tgt = self.norm3(tgt)
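The changes in this file track PyTorch deprecations rather than altering behaviour: torch.cuda.amp.autocast(enabled=...) now emits a FutureWarning in favour of torch.amp.autocast("cuda", ...), and torch.utils.checkpoint.checkpoint warns unless use_reentrant is passed explicitly (use_reentrant=True keeps the legacy behaviour). Illustrative warning-free forms, as a sketch:

import torch
import torch.utils.checkpoint as checkpoint

with torch.amp.autocast("cuda", enabled=False):
    pass  # replaces torch.cuda.amp.autocast(enabled=False)
# out = checkpoint.checkpoint(layer, x, mask, use_reentrant=True)  # explicit flag silences the warning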

1
groundingdino/version.py Normal file
View File

@@ -0,0 +1 @@
__version__ = '0.1.0'

137
main.py Normal file
View File

@@ -0,0 +1,137 @@
from pathlib import Path
import cv2
import numpy as np
import supervision as sv
import torch
import yaml
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image
from tqdm import tqdm
from groundingdino.util.inference import Model, preprocess_caption
GROUNDING_DINO_CONFIG = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "gdino_checkpoints/groundingdino_swint_ogc.pth"
BOX_THRESHOLD = 0.4
TEXT_THRESHOLD = 0.25
class PdfConverterService:
"""
A service to convert PDF files to images and resize images,
using pdf2image for PDFs and Pillow for images.
"""
def __init__(self, dpi: int):
self._pdfium_initialized = False # Track if PDFium needs explicit init/deinit
self._dpi = dpi
def convert_pdf_to_jpg(self, file_path: str) -> list[Image.Image]:
"""
Converts a PDF file to PIL images, one per page, at the configured DPI.
"""
pil_images = convert_from_path(file_path, dpi=self._dpi)
return pil_images
def resize_image(self, img: Image.Image, size: tuple[int, int]) -> Image.Image:
"""
Resizes a PIL Image to the specified size.
"""
return img.resize(size, Image.LANCZOS)
@staticmethod
def save_image_as_png(img: Image.Image, file_path: str):
"""
Saves a PIL Image as a PNG file.
"""
img.save(file_path, format="PNG")
@staticmethod
def to_cv2_image(img: Image.Image):
open_cv_image = np.array(img.convert("RGB"))
return open_cv_image[:, :, ::-1].copy()
def convert_pdf_bytes_to_jpg(self, pdf_bytes: bytes) -> list[Image.Image]:
"""
Converts PDF bytes to PIL images, one per page, at the configured DPI.
"""
pil_images = convert_from_bytes(pdf_bytes, dpi=self._dpi)
return pil_images
def main(
data_dir: str | Path,
text: str = "ID card. Carte Vitale. Bank details. Human face.",
concept_list_yaml: str | None = None,
device: str = "cuda:0" if torch.cuda.is_available() else "cpu",
):
output_dir = Path("outputs") / "extract"
output_dir.mkdir(parents=True, exist_ok=True)
if concept_list_yaml:
print(f"Overriding concepts !")
with open(concept_list_yaml, "r") as f:
concepts = yaml.load(f)
text = "".join([f" {x}." for x in concepts])
print(f"List of concepts to detect: {text}")
if isinstance(data_dir, str):
data_dir = Path(data_dir)
for img_path in tqdm(
data_dir.glob("*.pdf"), total=len(list(data_dir.glob("*.pdf")))
):
pdf_convertor = PdfConverterService(120)
if img_path.suffix == ".pdf":
imgs = pdf_convertor.convert_pdf_to_jpg(str(img_path))
img = imgs[0]
pdf_convertor.save_image_as_png(img, img_path.parent / "test.png")
img = pdf_convertor.to_cv2_image(img)
else:
img = cv2.imread(str(img_path))
# image_source, image = load_image(str(img_path.parent / "test.png"))
grounding_model = Model(
model_config_path=GROUNDING_DINO_CONFIG,
model_checkpoint_path=GROUNDING_DINO_CHECKPOINT,
device=device,
)
caption = preprocess_caption(text)
detections, labels = grounding_model.predict_with_caption(
image=img,
caption=caption,
box_threshold=BOX_THRESHOLD,
text_threshold=TEXT_THRESHOLD,
)
confidences = detections.confidence.tolist()
class_names = labels
labels = [
f"{class_name} {confidence:.2f}"
for class_name, confidence in zip(class_names, confidences)
]
for i, bbox in enumerate(detections.xyxy):
x_min, y_min, x_max, y_max = tuple(bbox)
patch = img[int(y_min) : int(y_max), int(x_min) : int(x_max)]
patch_img_path = str(
Path("outputs") / "extract" / f"{img_path.stem}_{i:d}.png"
)
cv2.imwrite(patch_img_path, patch)
box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
annotated_frame = box_annotator.annotate(
scene=img.copy(), detections=detections
)
label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
annotated_frame = label_annotator.annotate(
scene=annotated_frame, detections=detections, labels=labels
)
cv2.imwrite(str(Path("outputs") / f"{img_path.stem}.png"), annotated_frame)
if __name__ == "__main__":
main("data")

25
pyproject.toml Normal file
View File

@@ -0,0 +1,25 @@
[project]
name = "groundingdino"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"torch>=2.3.1",
"torchvision>=0.18.1",
"numpy>=1.24.4",
"tqdm>=4.66.1",
"hydra-core>=1.3.2",
"iopath>=0.1.10",
"pillow>=9.4.0",
"opencv-python-headless>=4.8.0",
"supervision>=0.26.1",
"pycocotools>=2.0.10",
"transformers>=4.55.1",
"addict>=2.4.0",
"yapf>=0.43.0",
"timm>=1.0.19",
"pdf2image>=1.17.0",
"pip>=25.2",
"setuptools>=80.9.0",
]

1225
uv.lock generated Normal file

File diff suppressed because it is too large