Fix demos for CPU inference (#104 )

Change default output dir for HF demo (#105 )
feat:add grounded_sam2_tracking_camera_with_continuous_id.py (closes … (#97 )
2025-05-27 00:24:30 +08:00 · 2025-05-27 00:24:17 +08:00 · 2025-05-08 11:02:33 +08:00 · 2025-05-08 11:02:04 +08:00 · 2025-04-21 01:06:01 +08:00 · 2025-04-20 01:04:26 +08:00
953 changed files with 20928 additions and 73917 deletions
@@ -1,3 +1,15 @@
+# SAM 2
+.vscode/
+.DS_Store
+__pycache__/
+*-checkpoint.ipynb
+.venv
+*.egg*
+build/*
+_C.*
+outputs/*
+checkpoints/*.pt
+*test*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -14,11 +26,13 @@ dist/
 downloads/
 eggs/
 .eggs/
+lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
+pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
@@ -48,7 +62,6 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
-cover/

 # Translations
 *.mo
@@ -71,7 +84,6 @@ instance/
 docs/_build/

 # PyBuilder
-.pybuilder/
 target/

 # Jupyter Notebook
@@ -82,9 +94,7 @@ profile_default/
 ipython_config.py

 # pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version

 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -93,24 +103,7 @@ ipython_config.py
 #   install all needed dependencies.
 #Pipfile.lock

-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
-.pdm.toml
-.pdm-python
-.pdm-build/
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/

 # Celery stuff
@@ -147,20 +140,8 @@ dmypy.json
 # Pyre type checker
 .pyre/

-# pytype static type analyzer
-.pytype/
+# checkpoint
+*.pth
+outputs/

-# Cython debug symbols
-cython_debug/
-
-# evaluation results
-evaluation_results/*
-results/*
-debug/*
-visualization/*
-
-# .DS_Store
-.DS_Store
-
-# For Testing
-demo/
+.idea/
@@ -0,0 +1,37 @@
+FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel
+
+# Arguments to build Docker Image using CUDA
+# (the Makefile `build-image` target passes both via --build-arg)
+ARG USE_CUDA=0
+ARG TORCH_ARCH="7.0;7.5;8.0;8.6"
+
+ENV AM_I_DOCKER=True
+ENV BUILD_WITH_CUDA="${USE_CUDA}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_ARCH}"
+ENV CUDA_HOME=/usr/local/cuda-12.1/
+# Ensure CUDA is correctly set up
+ENV PATH=/usr/local/cuda-12.1/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}
+
+# Install required packages and specific gcc/g++
+# `autoremove -y`: the build is non-interactive, so apt must never wait for confirmation
+RUN apt-get update && apt-get install --no-install-recommends wget ffmpeg=7:* \
+    libsm6=2:* libxext6=2:* git=1:* nano vim=2:* ninja-build gcc-10 g++-10 -y \
+    && apt-get clean && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
+
+ENV CC=gcc-10
+ENV CXX=g++-10
+
+# Install essential Python packages
+# Done before copying the sources so this layer stays cached when only project files change
+RUN python -m pip install --upgrade pip "setuptools>=62.3.0,<75.9" wheel numpy \
+    opencv-python transformers supervision pycocotools addict yapf timm
+
+RUN mkdir -p /home/appuser/Grounded-SAM-2
+COPY . /home/appuser/Grounded-SAM-2/
+
+WORKDIR /home/appuser/Grounded-SAM-2
+
+
+# Install SAM 2 (this repository) in editable mode
+RUN python -m pip install -e .
+
+# Install Grounding DINO in editable mode, without build isolation so the build
+# uses the packages already installed in the image
+RUN python -m pip install --no-build-isolation -e grounding_dino
@@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2023 - present, IDEA Research.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -198,4 +198,4 @@
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 - present, IDEA Research.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -0,0 +1,37 @@
+# Get version of CUDA and enable it for compilation if CUDA > 11.0
+# This solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53
+# and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84
+# when running in Docker
+# Check if nvcc is installed
+NVCC := $(shell which nvcc)
+ifeq ($(NVCC),)
+	# NVCC not found
+	USE_CUDA := 0
+	NVCC_VERSION := "not installed"
+else
+	# Extract the version number that follows "release" in the nvcc banner
+	NVCC_VERSION := $(shell nvcc --version | grep -oP 'release \K[0-9.]+')
+	# bc prints 1 when the comparison holds and 0 otherwise
+	USE_CUDA := $(shell echo "$(NVCC_VERSION) > 11" | bc -l)
+endif
+
+# Add the list of supported ARCHs
+ifeq ($(USE_CUDA), 1)
+	TORCH_CUDA_ARCH_LIST := "7.0;7.5;8.0;8.6+PTX"
+	BUILD_MESSAGE := "I will try to build the image with CUDA support"
+else
+	TORCH_CUDA_ARCH_LIST :=
+	BUILD_MESSAGE := "CUDA $(NVCC_VERSION) is not supported"
+endif
+
+# Neither target produces a file of the same name, so always run their recipes
+.PHONY: build-image run
+
+# Build the Docker image, forwarding the detected CUDA settings as build args
+build-image:
+	@echo $(BUILD_MESSAGE)
+	docker build --build-arg USE_CUDA=$(USE_CUDA) \
+	--build-arg TORCH_ARCH=$(TORCH_CUDA_ARCH_LIST) \
+	-t grounded_sam2:1.0 .
+
+# Start an interactive container with the current directory mounted as the project dir.
+# `$$DISPLAY` is escaped so the shell expands it; a single `$` would be read by make
+# as `$(D)ISPLAY` and pass the literal string "ISPLAY" to the container.
+run:
+	docker run --gpus all -it --rm --net=host --privileged \
+	-v /tmp/.X11-unix:/tmp/.X11-unix \
+	-v "${PWD}":/home/appuser/Grounded-SAM-2 \
+	-e DISPLAY="$$DISPLAY" \
+	--name=gsa \
+	--ipc=host grounded_sam2:1.0
@@ -1,28 +1,495 @@
-## Grounded SAMURAI
+# Grounded SAM 2: Ground and Track Anything in Videos
+
+**[IDEA-Research](https://github.com/idea-research)**
+
+[Tianhe Ren](https://rentainhe.github.io/), [Shuo Shen](https://github.com/ShuoShenDe)
+
+[[`SAM 2 Paper`](https://arxiv.org/abs/2408.00714)] [[`Grounding DINO Paper`](https://arxiv.org/abs/2303.05499)] [[`Grounding DINO 1.5 Paper`](https://arxiv.org/abs/2405.10300)] [[`DINO-X Paper`](https://arxiv.org/abs/2411.14347)] [[`BibTeX`](#citation)]
+
+[![Video Name](./assets/grounded_sam_2_intro.jpg)](https://github.com/user-attachments/assets/f0fb0022-779a-49fb-8f46-3a18a8b4e893)
+
+## Highlights
+
+ Grounded SAM 2 is a foundation model pipeline for grounding and tracking anything in videos with [Grounding DINO](https://arxiv.org/abs/2303.05499), [Grounding DINO 1.5](https://arxiv.org/abs/2405.10300), [Florence-2](https://arxiv.org/abs/2311.06242), [DINO-X](https://arxiv.org/abs/2411.14347) and [SAM 2](https://arxiv.org/abs/2408.00714).
+
+In this repo, we support the following demos with **simple implementations**:
+- **Ground and Segment Anything** with Grounding DINO, Grounding DINO 1.5 & 1.6, DINO-X and SAM 2
+- **Ground and Track Anything** with Grounding DINO, Grounding DINO 1.5 & 1.6, DINO-X and SAM 2
+- **Detect, Segment and Track Visualization** based on the powerful [supervision](https://github.com/roboflow/supervision) library.
+
+Grounded SAM 2 does not introduce significant methodological changes compared to [Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks](https://arxiv.org/abs/2401.14159). Both approaches leverage the capabilities of open-world models to address complex visual tasks. Consequently, we try to **simplify the code implementation** in this repository, aiming to enhance user convenience.
+
+## Latest updates
+- **2025.04.20**: Update to `dds-cloudapi-sdk` API V2 version. The V1 version in the original API for `Grounding DINO 1.5` and `DINO-X` has been deprecated, please update to the latest `dds-cloudapi-sdk` by `pip install dds-cloudapi-sdk -U` to use `Grounding DINO 1.5 / 1.6` and `DINO-X` models. Please refer to [dds-cloudapi-sdk](https://github.com/deepdataspace/dds-cloudapi-sdk) and our [API docs](https://cloud.deepdataspace.com/docs) to view more details about the update.
+
+- **2024.12.02**: Support **DINO-X with SAM 2** demos (including object segmentation and tracking), please install the latest version of `dds-cloudapi-sdk==0.3.3` and refer to [Grounded SAM 2 (with DINO-X)](#grounded-sam-2-image-demo-with-dino-x) and [Grounded SAM 2 Video (with DINO-X)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-dino-x) for more details.
+
+- **2024.10.24**: Support [SAHI (Slicing Aided Hyper Inference)](https://docs.ultralytics.com/guides/sahi-tiled-inference/) on Grounded SAM 2 (with Grounding DINO 1.5) which may be helpful for inferencing high resolution image with dense small objects (e.g. **4K** images).
+
+- **2024.10.10**: Support `SAM-2.1` models. If you want to use the `SAM 2.1` model, you need to update to the latest code and reinstall SAM 2 following [SAM 2.1 Installation](https://github.com/facebookresearch/sam2?tab=readme-ov-file#latest-updates).
+
+- **2024.08.31**: Support `dump json results` in Grounded SAM 2 Image Demos (with Grounding DINO).
+
+- **2024.08.20**: Support **Florence-2 SAM 2 Image Demo** which includes `dense region caption`, `object detection`, `phrase grounding`, and cascaded auto-label pipeline `caption + phrase grounding`.
+
+- **2024.08.09**: Support **Ground and Track New Object** throughout the whole video. This feature is still under development. Credits to [Shuo Shen](https://github.com/ShuoShenDe).
+
+- **2024.08.07**: Support **Custom Video Inputs**: users only need to submit their video file (e.g. a `.mp4` file) with specific text prompts to get impressive demo videos.
+
+## Contents
+- [Installation](#installation)
+- [Grounded SAM 2 Demos](#grounded-sam-2-demos)
+  - [Grounded SAM 2 Image Demo](#grounded-sam-2-image-demo-with-grounding-dino)
+  - [Grounded SAM 2 Image Demo (with Grounding DINO 1.5 & 1.6)](#grounded-sam-2-image-demo-with-grounding-dino-15--16)
+  - [Grounded SAM 2 Image Demo (with DINO-X)](#grounded-sam-2-image-demo-with-dino-x)
+  - [Grounded SAM 2 with SAHI for High Resolution Image Inference](#sahi-slicing-aided-hyper-inference-with-grounding-dino-15-and-sam-2)
+  - [Automatically Saving Grounding and Segmentation Results](#automatically-saving-grounding-results-image-demo)
+  - [Grounded SAM 2 Video Object Tracking Demo](#grounded-sam-2-video-object-tracking-demo)
+  - [Grounded SAM 2 Video Object Tracking Demo (with Grounding DINO 1.5 & 1.6)](#grounded-sam-2-video-object-tracking-demo-with-grounding-dino-15--16)
+  - [Grounded SAM 2 Video Object Tracking with Custom Video Input (using Grounding DINO)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-grounding-dino)
+  - [Grounded SAM 2 Video Object Tracking with Custom Video Input (using Grounding DINO 1.5 & 1.6)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-grounding-dino-15--16)
+  - [Grounded SAM 2 Video Object Tracking Demo (with DINO-X)](#grounded-sam-2-video-object-tracking-demo-with-custom-video-input-with-dino-x)
+  - [Grounded SAM 2 Video Object Tracking with Continuous ID (using Grounding DINO)](#grounded-sam-2-video-object-tracking-with-continuous-id-with-grounding-dino)
+- [Grounded SAM 2 Florence-2 Demos](#grounded-sam-2-florence-2-demos)
+  - [Grounded SAM 2 Florence-2 Image Demo](#grounded-sam-2-florence-2-image-demo)
+  - [Grounded SAM 2 Florence-2 Image Auto-Labeling Demo](#grounded-sam-2-florence-2-image-auto-labeling-demo)
+- [Citation](#citation)

-We have tried to implement Grounded SAMURAI for long video object tracking and segmentation.

-[![Video Name]()](https://github.com/user-attachments/assets/51db13b6-1083-4c22-af14-c34e09403591)

 ## Installation

-### Install SAMURAI
-Please refer to [SAMURAI Install](./SAMURAI_README.md) for more details.
+Download the pretrained `SAM 2` checkpoints:

-### Register on Offical Website to Get API Token
+```bash
+cd checkpoints
+bash download_ckpts.sh
+```

- **First-Time Application**: If you are interested in our project and wish to try our algorithm, you will need to apply for the corresponding API Token through our [request API token website](https://cloud.deepdataspace.com/apply-token?from=github) for your first attempt.
+Download the pretrained `Grounding DINO` checkpoints:

- **Request Additional Token Quotas**: If you find our project helpful and need more API token quotas, you can request additional tokens by [filling out this form](https://docs.google.com/forms/d/e/1FAIpQLSfjogAtkgoVyFX9wvCAE15mD7QtHdKdKOrVmcE5GT1xu-03Aw/viewform?usp=sf_link). Our team will review your request and allocate more tokens for your use in one or two days. You can also apply for more tokens by sending us an email.
+```bash
+cd gdino_checkpoints
+bash download_ckpts.sh
+```

-**Note:** If you encounter some errors with API, please install the latest version of `dds-cloudapi-sdk`:
+### Installation without docker
+
+Install PyTorch environment first. We use `python=3.10`, as well as `torch >= 2.3.1`, `torchvision>=0.18.1` and `cuda-12.1` in our environment to run this demo. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. Installing both PyTorch and TorchVision with CUDA support is strongly recommended. You can easily install the latest version of PyTorch as follows:
+
+```bash
+pip3 install torch torchvision torchaudio
+```
+
+Since we need the CUDA compilation environment to compile the `Deformable Attention` operator used in Grounding DINO, we need to check whether the CUDA environment variables have been set correctly (which you can refer to [Grounding DINO Installation](https://github.com/IDEA-Research/GroundingDINO?tab=readme-ov-file#hammer_and_wrench-install) for more details). You can set the environment variable manually as follows if you want to build a local GPU environment for Grounding DINO to run Grounded SAM 2:
+
+```bash
+export CUDA_HOME=/path/to/cuda-12.1/
+```
+
+Install `Segment Anything 2`:
+
+```bash
+pip install -e .
+```
+
+Install `Grounding DINO`:
+
+```bash
+pip install --no-build-isolation -e grounding_dino
+```
+
+### Installation with docker
+Build the Docker image and Run the Docker container:
+
+```
+cd Grounded-SAM-2
+make build-image
+make run
+```
+After executing these commands, you will be inside the Docker environment. The working directory within the container is set to: `/home/appuser/Grounded-SAM-2`
+
+Once inside the Docker environment, you can start the demo by running:
+```
+python grounded_sam2_tracking_demo.py
+```
+
+## Grounded SAM 2 Demos
+### Grounded SAM 2 Image Demo (with Grounding DINO)
+Note that `Grounding DINO` has already been supported in [Huggingface](https://huggingface.co/IDEA-Research/grounding-dino-tiny), so we provide two choices for running `Grounded SAM 2` model:
+- Use huggingface API to inference Grounding DINO (which is simple and clear)
+
+```bash
+python grounded_sam2_hf_model_demo.py
+```
+
+> [!NOTE]
+> 🚨 If you encounter network issues while using the `HuggingFace` model, you can resolve them by setting the appropriate mirror source as `export HF_ENDPOINT=https://hf-mirror.com`
+
+- Load local pretrained Grounding DINO checkpoint and inference with Grounding DINO original API (make sure you've already downloaded the pretrained checkpoint)
+
+```bash
+python grounded_sam2_local_demo.py
+```
+
+
+### Grounded SAM 2 Image Demo (with Grounding DINO 1.5 & 1.6)
+
+We've already released our most capable open-set detection model [Grounding DINO 1.5 & 1.6](https://github.com/IDEA-Research/Grounding-DINO-1.5-API), which can be combined with SAM 2 for stronger open-set detection and segmentation capability. You can apply the API token first and run Grounded SAM 2 with Grounding DINO 1.5 as follows:
+
+Install the latest DDS cloudapi:

 ```bash
 pip install dds-cloudapi-sdk --upgrade
-``` 
+```

-### Demos
+Apply for your API token on our official website here: [request API token](https://deepdataspace.com/request_api).

 ```bash
-python grounded_samurai_dinox.py
+python grounded_sam2_gd1.5_demo.py
+```
+
+### SAHI (Slicing Aided Hyper Inference) with Grounding DINO 1.5 and SAM 2
+
+If your images are high resolution with dense objects, directly using Grounding DINO 1.5 for inference on the original image may not be the best choice. We support [SAHI (Slicing Aided Hyper Inference)](https://docs.ultralytics.com/guides/sahi-tiled-inference/), which works by first dividing the original image into smaller overlapping patches. Inference is then performed separately on each patch, and the final detection results are merged. This method is highly effective and accurate for detecting dense and small objects in high resolution images.
+
+You can run SAHI inference by setting the following param in [grounded_sam2_gd1.5_demo.py](./grounded_sam2_gd1.5_demo.py):
+
+```python
+WITH_SLICE_INFERENCE = True
+```
+
+The visualization is shown as follows:
+
+| Text Prompt | Input Image | Grounded SAM 2 | Grounded SAM 2 with SAHI |
+|:----:|:----:|:----:|:----:|
+| `Person` | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam_2/demo_images/dense%20people.png?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam_2/grounding_dino_1.5_slice_inference/grounded_sam2_annotated_image_with_mask.jpg?raw=true) | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam_2/grounding_dino_1.5_slice_inference/grounded_sam2_annotated_image_with_mask_with_slice_inference.jpg?raw=true) |
+
+- **Notes:** We only support SAHI on Grounding DINO 1.5 because it works better with a stronger grounding model, which may produce fewer hallucinated results.
+
+### Grounded SAM 2 Image Demo (with DINO-X)
+
+We've implemented Grounded SAM 2 with the strongest open-world perception model [DINO-X](https://github.com/IDEA-Research/DINO-X-API) for better open-set detection and segmentation performance. You can apply the API token first and run Grounded SAM 2 with DINO-X as follows:
+
+Install the latest DDS cloudapi:
+
+```bash
+pip install dds-cloudapi-sdk --upgrade
+```
+
+Apply for your API token on our official website here: [request API token](https://deepdataspace.com/request_api).
+
+```bash
+python grounded_sam2_dinox_demo.py
+```
+
+### Automatically Saving Grounding Results (Image Demo)
+
+After setting `DUMP_JSON_RESULTS=True` in the following Grounded SAM 2 Image Demos:
+- [grounded_sam2_local_demo.py](./grounded_sam2_local_demo.py)
+- [grounded_sam2_hf_model_demo.py](./grounded_sam2_hf_model_demo.py)
+- [grounded_sam2_gd1.5_demo.py](./grounded_sam2_gd1.5_demo.py)
+- [grounded_sam2_dinox_demo.py](./grounded_sam2_dinox_demo.py)
+
+The `grounding` and `segmentation` results will be automatically saved in the `outputs` dir with the following format:
+
+```python
+{
+    "image_path": "path/to/image.jpg",
+    "annotations": [
+        {
+            "class_name": "class_name",
+            "bbox": [x1, y1, x2, y2],
+            "segmentation": {
+                "size": [h, w],
+                "counts": "rle_encoded_mask"
+            },
+            "score": confidence score
+        }
+    ],
+    "box_format": "xyxy",
+    "img_width": w,
+    "img_height": h
+}
+```
+
+
+
+### Grounded SAM 2 Video Object Tracking Demo
+
+Based on the strong tracking capability of SAM 2, we can combine it with Grounding DINO for open-set object segmentation and tracking. You can run the following script to get the tracking results with Grounded SAM 2:
+
+```bash
+python grounded_sam2_tracking_demo.py
+```
+
+- The tracking results of each frame will be saved in `./tracking_results`
+- The video will be saved as `children_tracking_demo_video.mp4`
+- You can refine this file with different text prompt and video clips yourself to get more tracking results.
+- We only prompt the first video frame with Grounding DINO here for simple usage.
+
+#### Support Various Prompt Type for Tracking
+
+We've supported different types of prompt for Grounded SAM 2 tracking demo:
+
+- **Point Prompt**: In order to **get a stable segmentation results**, we re-use the SAM 2 image predictor to get the prediction mask from each object based on Grounding DINO box outputs, then we **uniformly sample points from the prediction mask** as point prompts for SAM 2 video predictor
+- **Box Prompt**: We directly use the box outputs from Grounding DINO as box prompts for SAM 2 video predictor
+- **Mask Prompt**: We use the SAM 2 mask prediction results based on Grounding DINO box outputs as mask prompt for SAM 2 video predictor.
+
+![Grounded SAM 2 Tracking Pipeline](./assets/g_sam2_tracking_pipeline_vis_new.png)
+
+
+### Grounded SAM 2 Video Object Tracking Demo (with Grounding DINO 1.5 & 1.6)
+
+We also support a video object tracking demo based on our stronger `Grounding DINO 1.5` model and `SAM 2`. You can try the following demo after applying for the API keys required to run `Grounding DINO 1.5`:
+
+```bash
+python grounded_sam2_tracking_demo_with_gd1.5.py
+```
+
+### Grounded SAM 2 Video Object Tracking Demo with Custom Video Input (with Grounding DINO)
+
+Users can upload their own video file (e.g. `assets/hippopotamus.mp4`) and specify their custom text prompts for grounding and tracking with Grounding DINO and SAM 2 by using the following scripts:
+
+```bash
+python grounded_sam2_tracking_demo_custom_video_input_gd1.0_hf_model.py
+```
+
+If it is not convenient for you to use the Hugging Face demo, you can also run the tracking demo with a local Grounding DINO model using the following script:
+
+```bash
+python grounded_sam2_tracking_demo_custom_video_input_gd1.0_local_model.py
+```
+
+### Grounded SAM 2 Video Object Tracking Demo with Custom Video Input (with Grounding DINO 1.5 & 1.6)
+
+Users can upload their own video file (e.g. `assets/hippopotamus.mp4`) and specify their custom text prompts for grounding and tracking with Grounding DINO 1.5 and SAM 2 by using the following scripts:
+
+```bash
+python grounded_sam2_tracking_demo_custom_video_input_gd1.5.py
+```
+
+You can specify the params in this file:
+
+```python
+VIDEO_PATH = "./assets/hippopotamus.mp4"
+TEXT_PROMPT = "hippopotamus."
+OUTPUT_VIDEO_PATH = "./hippopotamus_tracking_demo.mp4"
+API_TOKEN_FOR_GD1_5 = "Your API token" # api token for G-DINO 1.5
+PROMPT_TYPE_FOR_VIDEO = "mask" # using SAM 2 mask prediction as prompt for video predictor
+```
+
+After running our demo code, you can get the tracking results as follows:
+
+[![Video Name](./assets/hippopotamus_seg.jpg)](https://github.com/user-attachments/assets/1fbdc6f4-3e50-4221-9600-98c397beecdf)
+
+And we will automatically save the tracking visualization results in `OUTPUT_VIDEO_PATH`.
+
+> [!WARNING]
+> We initialize the box prompts on the first frame of the input video. If you want to start from a different frame, you can change `ann_frame_idx` yourself in our code.
+
+### Grounded SAM 2 Video Object Tracking Demo with Custom Video Input (with DINO-X)
+
+Users can upload their own video file (e.g. `assets/hippopotamus.mp4`) and specify their custom text prompts for grounding and tracking with DINO-X and SAM 2 by using the following scripts:
+
+```bash
+python grounded_sam2_tracking_demo_custom_video_input_dinox.py
+```
+
+### Grounded-SAM-2 Video Object Tracking with Continuous ID (with Grounding DINO)
+
+In the above demos, we only prompt Grounded SAM 2 on a specific frame, which makes it hard to pick up new objects that appear later in the video. In this demo, we try to **find new objects** and assign them new IDs across the whole video. This function is **still under development** and is not yet stable.
+
+Users can upload their own video files and specify custom text prompts for grounding and tracking using the Grounding DINO and SAM 2 frameworks. To do this, execute the script:
+
+
+```bash 
+python grounded_sam2_tracking_demo_with_continuous_id.py
+```
+
+You can customize various parameters including:
+
+- `text`: The grounding text prompt.
+- `video_dir`: Directory containing the video files.
+- `output_dir`: Directory to save the processed output.
+- `output_video_path`: Path for the output video.
+- `step`: Frame stepping for processing.
+- `box_threshold`: box threshold for groundingdino model
+- `text_threshold`: text threshold for groundingdino model
+
+Note: This method supports only the mask type of text prompt.
+
+After running our demo code, you can get the tracking results as follows:
+
+[![Video Name](./assets/tracking_car_mask_1.jpg)](https://github.com/user-attachments/assets/d3f91ad0-3d32-43c4-a0dc-0bed661415f4)
+
+If you want to try `Grounding DINO 1.5` model, you can run the following scripts after setting your API token:
+
+```bash
+python grounded_sam2_tracking_demo_with_continuous_id_gd1.5.py
+```
+
+### Grounded-SAM-2 Video Object Tracking with Continuous ID plus Reverse Tracking(with Grounding DINO)
+This method can cover the whole lifetime of each object:
+```bash
+python grounded_sam2_tracking_demo_with_continuous_id_plus.py
+
+```
+
+### Grounded-SAM-2 Real-Time Object Tracking with Continuous ID (Live Video / Camera Stream)
+
+This method enables **real-time object tracking** with **ID continuity** from a live camera or video stream. 
+
+```bash
+python grounded_sam2_tracking_camera_with_continuous_id.py
+```
+
+
+
+## Grounded SAM 2 Florence-2 Demos
+### Grounded SAM 2 Florence-2 Image Demo
+
+In this section, we will explore how to integrate the feature-rich and robust open-source models [Florence-2](https://arxiv.org/abs/2311.06242) and SAM 2 to develop practical applications.
+
+[Florence-2](https://arxiv.org/abs/2311.06242) is a powerful vision foundation model by Microsoft which supports a series of vision tasks by prompting with a special `task_prompt`, including but not limited to:
+
+| Task | Task Prompt | Text Input | Task Introduction |
+|:---:|:---:|:---:|:---:|
+| Object Detection | `<OD>` | &#10008; | Detect main objects with single category name |
+| Dense Region Caption | `<DENSE_REGION_CAPTION>` | &#10008; | Detect main objects with short description |
+| Region Proposal | `<REGION_PROPOSAL>` | &#10008; | Generate proposals without category name |
+| Phrase Grounding | `<CAPTION_TO_PHRASE_GROUNDING>` | &#10004; | Ground main objects in image mentioned in caption |
+| Referring Expression Segmentation | `<REFERRING_EXPRESSION_SEGMENTATION>` | &#10004; | Ground the object which is most related to the text input |
+| Open Vocabulary Detection and Segmentation | `<OPEN_VOCABULARY_DETECTION>` | &#10004; | Ground any object with text input |
+
+
+By integrating `Florence-2` with `SAM-2`, we can build a strong vision pipeline to solve complex vision tasks. You can try the following scripts to run the demo:
+
+> [!NOTE]
+> 🚨 If you encounter network issues while using the `HuggingFace` model, you can resolve them by setting the appropriate mirror source as `export HF_ENDPOINT=https://hf-mirror.com`
+
+**Object Detection and Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline object_detection_segmentation \
+    --image_path ./notebooks/images/cars.jpg
+```
+
+**Dense Region Caption and Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline dense_region_caption_segmentation \
+    --image_path ./notebooks/images/cars.jpg
+```
+
+**Region Proposal and Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline region_proposal_segmentation \
+    --image_path ./notebooks/images/cars.jpg
+```
+
+**Phrase Grounding and Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline phrase_grounding_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The image shows two vintage Chevrolet cars parked side by side, with one being a red convertible and the other a pink sedan, \
+            set against the backdrop of an urban area with a multi-story building and trees. \
+            The cars have Cuban license plates, indicating a location likely in Cuba."
+```
+
+**Referring Expression Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline referring_expression_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "The left red car."
+```
+
+**Open-Vocabulary Detection and Segmentation**
+```bash
+python grounded_sam2_florence2_image_demo.py \
+    --pipeline open_vocabulary_detection_segmentation \
+    --image_path ./notebooks/images/cars.jpg \
+    --text_input "car <and> building"
+```
+- Note that if you want to **detect multiple classes** you should split them with `<and>` in your input text.
+
+
+### Grounded SAM 2 Florence-2 Image Auto-Labeling Demo
+`Florence-2` can be used as an automatic image annotator by cascading its caption capability with its grounding capability.
+
+| Task | Task Prompt | Text Input |
+|:---:|:---:|:---:|
+| Caption + Phrase Grounding | `<CAPTION>` + `<CAPTION_TO_PHRASE_GROUNDING>` | &#10008; |
+| Detailed Caption + Phrase Grounding | `<DETAILED_CAPTION>` + `<CAPTION_TO_PHRASE_GROUNDING>` | &#10008; |
+| More Detailed Caption + Phrase Grounding | `<MORE_DETAILED_CAPTION>` + `<CAPTION_TO_PHRASE_GROUNDING>` | &#10008; |
+
+You can try the following script to run this demo:
+
+**Caption to Phrase Grounding**
+```bash
+python grounded_sam2_florence2_autolabel_pipeline.py \
+    --image_path ./notebooks/images/groceries.jpg \
+    --pipeline caption_to_phrase_grounding \
+    --caption_type caption
+```
+
+- You can specify `caption_type` to control the granularity of the caption. If you want a more detailed caption, you can try `--caption_type detailed_caption` or `--caption_type more_detailed_caption`.
+
+### Citation
+
+If you find this project helpful for your research, please consider citing the following BibTeX entry.
+
+```BibTex
+@misc{ravi2024sam2segmentimages,
+      title={SAM 2: Segment Anything in Images and Videos}, 
+      author={Nikhila Ravi and Valentin Gabeur and Yuan-Ting Hu and Ronghang Hu and Chaitanya Ryali and Tengyu Ma and Haitham Khedr and Roman Rädle and Chloe Rolland and Laura Gustafson and Eric Mintun and Junting Pan and Kalyan Vasudev Alwala and Nicolas Carion and Chao-Yuan Wu and Ross Girshick and Piotr Dollár and Christoph Feichtenhofer},
+      year={2024},
+      eprint={2408.00714},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2408.00714}, 
+}
+
+@article{liu2023grounding,
+  title={Grounding dino: Marrying dino with grounded pre-training for open-set object detection},
+  author={Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and others},
+  journal={arXiv preprint arXiv:2303.05499},
+  year={2023}
+}
+
+@misc{ren2024grounding,
+      title={Grounding DINO 1.5: Advance the "Edge" of Open-Set Object Detection}, 
+      author={Tianhe Ren and Qing Jiang and Shilong Liu and Zhaoyang Zeng and Wenlong Liu and Han Gao and Hongjie Huang and Zhengyu Ma and Xiaoke Jiang and Yihao Chen and Yuda Xiong and Hao Zhang and Feng Li and Peijun Tang and Kent Yu and Lei Zhang},
+      year={2024},
+      eprint={2405.10300},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{ren2024grounded,
+      title={Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks}, 
+      author={Tianhe Ren and Shilong Liu and Ailing Zeng and Jing Lin and Kunchang Li and He Cao and Jiayu Chen and Xinyu Huang and Yukang Chen and Feng Yan and Zhaoyang Zeng and Hao Zhang and Feng Li and Jie Yang and Hongyang Li and Qing Jiang and Lei Zhang},
+      year={2024},
+      eprint={2401.14159},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@article{kirillov2023segany,
+  title={Segment Anything}, 
+  author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Doll{\'a}r, Piotr and Girshick, Ross},
+  journal={arXiv:2304.02643},
+  year={2023}
+}
+
+@misc{jiang2024trex2,
+      title={T-Rex2: Towards Generic Object Detection via Text-Visual Prompt Synergy}, 
+      author={Qing Jiang and Feng Li and Zhaoyang Zeng and Tianhe Ren and Shilong Liu and Lei Zhang},
+      year={2024},
+      eprint={2403.14610},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
 ```
@@ -0,0 +1,140 @@
+# SAM 2: Segment Anything in Images and Videos
+
+**[AI at Meta, FAIR](https://ai.meta.com/research/)**
+
+[Nikhila Ravi](https://nikhilaravi.com/), [Valentin Gabeur](https://gabeur.github.io/), [Yuan-Ting Hu](https://scholar.google.com/citations?user=E8DVVYQAAAAJ&hl=en), [Ronghang Hu](https://ronghanghu.com/), [Chaitanya Ryali](https://scholar.google.com/citations?user=4LWx24UAAAAJ&hl=en), [Tengyu Ma](https://scholar.google.com/citations?user=VeTSl0wAAAAJ&hl=en), [Haitham Khedr](https://hkhedr.com/), [Roman Rädle](https://scholar.google.de/citations?user=Tpt57v0AAAAJ&hl=en), [Chloe Rolland](https://scholar.google.com/citations?hl=fr&user=n-SnMhoAAAAJ), [Laura Gustafson](https://scholar.google.com/citations?user=c8IpF9gAAAAJ&hl=en), [Eric Mintun](https://ericmintun.github.io/), [Junting Pan](https://junting.github.io/), [Kalyan Vasudev Alwala](https://scholar.google.co.in/citations?user=m34oaWEAAAAJ&hl=en), [Nicolas Carion](https://www.nicolascarion.com/), [Chao-Yuan Wu](https://chaoyuan.org/), [Ross Girshick](https://www.rossgirshick.info/), [Piotr Dollár](https://pdollar.github.io/), [Christoph Feichtenhofer](https://feichtenhofer.github.io/)
+
+[[`Paper`](https://ai.meta.com/research/publications/sam-2-segment-anything-in-images-and-videos/)] [[`Project`](https://ai.meta.com/sam2)] [[`Demo`](https://sam2.metademolab.com/)] [[`Dataset`](https://ai.meta.com/datasets/segment-anything-video)] [[`Blog`](https://ai.meta.com/blog/segment-anything-2)] [[`BibTeX`](#citing-sam-2)]
+
+![SAM 2 architecture](assets/model_diagram.png?raw=true)
+
+**Segment Anything Model 2 (SAM 2)** is a foundation model towards solving promptable visual segmentation in images and videos. We extend SAM to video by considering images as a video with a single frame. The model design is a simple transformer architecture with streaming memory for real-time video processing. We build a model-in-the-loop data engine, which improves model and data via user interaction, to collect [**our SA-V dataset**](https://ai.meta.com/datasets/segment-anything-video), the largest video segmentation dataset to date. SAM 2 trained on our data provides strong performance across a wide range of tasks and visual domains.
+
+![SA-V dataset](assets/sa_v_dataset.jpg?raw=true)
+
+## Installation
+
+Please install SAM 2 on a GPU machine using:
+
+```bash
+git clone https://github.com/facebookresearch/segment-anything-2.git
+
+cd segment-anything-2; pip install -e .
+```
+
+To use the SAM 2 predictor and run the example notebooks, `jupyter` and `matplotlib` are required and can be installed by:
+
+```bash
+pip install -e ".[demo]"
+```
+
+## Getting Started
+
+### Download Checkpoints
+
+First, we need to download a model checkpoint. All the model checkpoints can be downloaded by running:
+
+```bash
+cd checkpoints
+./download_ckpts.sh
+```
+
+or individually from:
+
+- [sam2_hiera_tiny.pt](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
+- [sam2_hiera_small.pt](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt)
+- [sam2_hiera_base_plus.pt](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt)
+- [sam2_hiera_large.pt](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt)
+
+Then SAM 2 can be used in a few lines as follows for image and video prediction.
+
+### Image prediction
+
+SAM 2 has all the capabilities of [SAM](https://github.com/facebookresearch/segment-anything) on static images, and we provide image prediction APIs that closely resemble SAM for image use cases. The `SAM2ImagePredictor` class has an easy interface for image prompting.
+
+```python
+import torch
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+checkpoint = "./checkpoints/sam2_hiera_large.pt"
+model_cfg = "sam2_hiera_l.yaml"
+predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    predictor.set_image(<your_image>)
+    masks, _, _ = predictor.predict(<input_prompts>)
+```
+
+Please refer to the examples in [image_predictor_example.ipynb](./notebooks/image_predictor_example.ipynb) for static image use cases.
+
+SAM 2 also supports automatic mask generation on images just like SAM. Please see [automatic_mask_generator_example.ipynb](./notebooks/automatic_mask_generator_example.ipynb) for automatic mask generation in images.
+
+### Video prediction
+
+For promptable segmentation and tracking in videos, we provide a video predictor with APIs for example to add prompts and propagate masklets throughout a video. SAM 2 supports video inference on multiple objects and uses an inference state to keep track of the interactions in each video.
+
+```python
+import torch
+from sam2.build_sam import build_sam2_video_predictor
+
+checkpoint = "./checkpoints/sam2_hiera_large.pt"
+model_cfg = "sam2_hiera_l.yaml"
+predictor = build_sam2_video_predictor(model_cfg, checkpoint)
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    state = predictor.init_state(<your_video>)
+
+    # add new prompts and instantly get the output on the same frame
+    frame_idx, object_ids, masks = predictor.add_new_points(state, <your prompts>)
+
+    # propagate the prompts to get masklets throughout the video
+    for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+        ...
+```
+
+Please refer to the examples in [video_predictor_example.ipynb](./notebooks/video_predictor_example.ipynb) for details on how to add prompts, make refinements, and track multiple objects in videos.
+
+## Model Description
+
+|      **Model**       | **Size (M)** |    **Speed (FPS)**     | **SA-V test (J&F)** | **MOSE val (J&F)** | **LVOS v2 (J&F)** |
+| :------------------: | :----------: | :--------------------: | :-----------------: | :----------------: | :---------------: |
+|   sam2_hiera_tiny    |     38.9     |          47.2          |        75.0         |        70.9        |       75.3        |
+|   sam2_hiera_small   |      46      | 43.3 (53.0 compiled\*) |        74.9         |        71.5        |       76.4        |
+| sam2_hiera_base_plus |     80.8     | 34.8 (43.8 compiled\*) |        74.7         |        72.8        |       75.8        |
+|   sam2_hiera_large   |    224.4     | 24.2 (30.2 compiled\*) |        76.0         |        74.6        |       79.8        |
+
+\* Compile the model by setting `compile_image_encoder: True` in the config.
+
+## Segment Anything Video Dataset
+
+See [sav_dataset/README.md](sav_dataset/README.md) for details.
+
+## License
+
+The models are licensed under the [Apache 2.0 license](./LICENSE). Please refer to our research paper for more details on the models.
+
+## Contributing
+
+See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
+
+## Contributors
+
+The SAM 2 project was made possible with the help of many contributors (alphabetical):
+
+Karen Bergan, Daniel Bolya, Alex Bosenberg, Kai Brown, Vispi Cassod, Christopher Chedeau, Ida Cheng, Luc Dahlin, Shoubhik Debnath, Rene Martinez Doehner, Grant Gardner, Sahir Gomez, Rishi Godugu, Baishan Guo, Caleb Ho, Andrew Huang, Somya Jain, Bob Kamma, Amanda Kallet, Jake Kinney, Alexander Kirillov, Shiva Koduvayur, Devansh Kukreja, Robert Kuo, Aohan Lin, Parth Malani, Jitendra Malik, Mallika Malhotra, Miguel Martin, Alexander Miller, Sasha Mitts, William Ngan, George Orlin, Joelle Pineau, Kate Saenko, Rodrick Shepard, Azita Shokrpour, David Soofian, Jonathan Torres, Jenny Truong, Sagar Vaze, Meng Wang, Claudette Ward, Pengchuan Zhang.
+
+Third-party code: we use a GPU-based connected component algorithm adapted from [`cc_torch`](https://github.com/zsef123/Connected_components_PyTorch) (with its license in [`LICENSE_cctorch`](./LICENSE_cctorch)) as an optional post-processing step for the mask predictions.
+
+## Citing SAM 2
+
+If you use SAM 2 or the SA-V dataset in your research, please use the following BibTeX entry.
+
+```bibtex
+@article{ravi2024sam2,
+  title={SAM 2: Segment Anything in Images and Videos},
+  author={Ravi, Nikhila and Gabeur, Valentin and Hu, Yuan-Ting and Hu, Ronghang and Ryali, Chaitanya and Ma, Tengyu and Khedr, Haitham and R{\"a}dle, Roman and Rolland, Chloe and Gustafson, Laura and Mintun, Eric and Pan, Junting and Alwala, Kalyan Vasudev and Carion, Nicolas and Wu, Chao-Yuan and Girshick, Ross and Doll{\'a}r, Piotr and Feichtenhofer, Christoph},
+  journal={arXiv preprint},
+  year={2024}
+}
+```
@@ -1,139 +0,0 @@
-<div align="center">
-<img align="left" width="100" height="100" src="https://github.com/user-attachments/assets/1834fc25-42ef-4237-9feb-53a01c137e83" alt="">
-
-# SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory
-
-[Cheng-Yen Yang](https://yangchris11.github.io), [Hsiang-Wei Huang](https://hsiangwei0903.github.io/), [Wenhao Chai](https://rese1f.github.io/), [Zhongyu Jiang](https://zhyjiang.github.io/#/), [Jenq-Neng Hwang](https://people.ece.uw.edu/hwang/)
-
-[Information Processing Lab, University of Washington](https://ipl-uw.github.io/) 
-</div>
-
-
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/samurai-adapting-segment-anything-model-for-1/visual-object-tracking-on-lasot-ext)](https://paperswithcode.com/sota/visual-object-tracking-on-lasot-ext?p=samurai-adapting-segment-anything-model-for-1)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/samurai-adapting-segment-anything-model-for-1/visual-object-tracking-on-got-10k)](https://paperswithcode.com/sota/visual-object-tracking-on-got-10k?p=samurai-adapting-segment-anything-model-for-1)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/samurai-adapting-segment-anything-model-for-1/visual-object-tracking-on-needforspeed)](https://paperswithcode.com/sota/visual-object-tracking-on-needforspeed?p=samurai-adapting-segment-anything-model-for-1)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/samurai-adapting-segment-anything-model-for-1/visual-object-tracking-on-lasot)](https://paperswithcode.com/sota/visual-object-tracking-on-lasot?p=samurai-adapting-segment-anything-model-for-1)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/samurai-adapting-segment-anything-model-for-1/visual-object-tracking-on-otb-2015)](https://paperswithcode.com/sota/visual-object-tracking-on-otb-2015?p=samurai-adapting-segment-anything-model-for-1)
-
-[[Arxiv]](https://arxiv.org/abs/2411.11922) [[Project Page]](https://yangchris11.github.io/samurai/) [[Raw Results]](https://drive.google.com/drive/folders/1ssiDmsC7mw5AiItYQG4poiR1JgRq305y?usp=sharing) 
-
-This repository is the official implementation of SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory
-
-https://github.com/user-attachments/assets/9d368ca7-2e9b-4fed-9da0-d2efbf620d88
-
-All rights are reserved to the copyright owners (TM & © Universal (2019)). This clip is not intended for commercial use and is solely for academic demonstration in a research paper. Original source can be found [here](https://www.youtube.com/watch?v=cwUzUzpG8aM&t=4s).
-
-## Getting Started
-
-#### SAMURAI Installation 
-
-SAM 2 needs to be installed first before use. The code requires `python>=3.10`, as well as `torch>=2.3.1` and `torchvision>=0.18.1`. Please follow the instructions [here](https://github.com/facebookresearch/sam2?tab=readme-ov-file) to install both PyTorch and TorchVision dependencies. You can install **the SAMURAI version** of SAM 2 on a GPU machine using:
-```
-cd sam2
-pip install -e .
-pip install -e ".[notebooks]"
-```
-
-Please see [INSTALL.md](https://github.com/facebookresearch/sam2/blob/main/INSTALL.md) from the original SAM 2 repository for FAQs on potential issues and solutions.
-
-Install other requirements:
-```
-pip install matplotlib==3.7 tikzplotlib jpeg4py opencv-python lmdb pandas scipy loguru
-```
-
-#### SAM 2.1 Checkpoint Download
-
-```
-cd checkpoints && \
-./download_ckpts.sh && \
-cd ..
-```
-
-#### Data Preparation
-
-Please prepare the data in the following format:
-```
-data/LaSOT
-├── airplane/
-│   ├── airplane-1/
-│   │   ├── full_occlusion.txt
-│   │   ├── groundtruth.txt
-│   │   ├── img
-│   │   ├── nlp.txt
-│   │   └── out_of_view.txt
-│   ├── airplane-2/
-│   ├── airplane-3/
-│   ├── ...
-├── basketball
-├── bear
-├── bicycle
-...
-├── training_set.txt
-└── testing_set.txt
-```
-
-#### Main Inference
-```
-python scripts/main_inference.py 
-```
-
-## Demo on Custom Video
-
-To run the demo with your custom video or frame directory, use the following examples:
-
-**Note:** The `.txt` file contains a single line with the bounding box of the first frame in `x,y,w,h` format.
-
-### Input is Video File
-
-```
-python scripts/demo.py --video_path <your_video.mp4> --txt_path <path_to_first_frame_bbox.txt>
-```
-
-### Input is Frame Folder
-```
-# Only JPG images are supported
-python scripts/demo.py --video_path <your_frame_directory> --txt_path <path_to_first_frame_bbox.txt>
-```
-
-## FAQs
-**Question 1:** Does SAMURAI need training? [issue 34](https://github.com/yangchris11/samurai/issues/34)
-
-**Answer 1:** Unlike real-life samurai, the proposed samurai do not require additional training. It is a zero-shot method, we directly use the weights from SAM 2.1 to conduct VOT experiments. Kalman filter is used to estimate the current and future state (bounding box location and scale in our case) of a moving object based on measurements over time, it is a common approach that had been adapt in the field of tracking for a long time which does not requires any training. Please refer to code for more detail.
-
-**Question 2:** Does SAMURAI support streaming input (e.g. webcam)?
-
-**Answer 2:** Not yet. The existing code doesn't support live/streaming video as we inherit most of the codebase from the amazing SAM 2. Some discussion that you might be interested in: facebookresearch/sam2#90, facebookresearch/sam2#388 (comment).
-
-**Question 3:** How to use SAMURAI in longer video?
-
-**Answer 3:** See the discussion from sam2 https://github.com/facebookresearch/sam2/issues/264.
-
-
-## Acknowledgment
-
-SAMURAI is built on top of [SAM 2](https://github.com/facebookresearch/sam2?tab=readme-ov-file) by Meta FAIR.
-
-The VOT evaluation code is modifed from [VOT Toolkit](https://github.com/votchallenge/toolkit) by Luka Čehovin Zajc.
-
-## Citation
-
-Please consider citing our paper and the wonderful `SAM 2` if you found our work interesting and useful.
-```
-@article{ravi2024sam2,
-  title={SAM 2: Segment Anything in Images and Videos},
-  author={Ravi, Nikhila and Gabeur, Valentin and Hu, Yuan-Ting and Hu, Ronghang and Ryali, Chaitanya and Ma, Tengyu and Khedr, Haitham and R{\"a}dle, Roman and Rolland, Chloe and Gustafson, Laura and Mintun, Eric and Pan, Junting and Alwala, Kalyan Vasudev and Carion, Nicolas and Wu, Chao-Yuan and Girshick, Ross and Doll{\'a}r, Piotr and Feichtenhofer, Christoph},
-  journal={arXiv preprint arXiv:2408.00714},
-  url={https://arxiv.org/abs/2408.00714},
-  year={2024}
-}
-
-@misc{yang2024samurai,
-      title={SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory}, 
-      author={Cheng-Yen Yang and Hsiang-Wei Huang and Wenhao Chai and Zhongyu Jiang and Jenq-Neng Hwang},
-      year={2024},
-      eprint={2411.11922},
-      archivePrefix={arXiv},
-      primaryClass={cs.CV},
-      url={https://arxiv.org/abs/2411.11922}, 
-}
-```
@@ -1,4 +0,0 @@
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
will ye	2111d9c52c	Fix demos for CPU inference (#104 )	2025-05-27 00:24:30 +08:00
will ye	75aaf0c3ae	Change default output dir for HF demo (#105 )	2025-05-27 00:24:17 +08:00
Embodied Learner	c5780dabeb	feat:add grounded_sam2_tracking_camera_with_continuous_id.py (closes … (#97 ) * feat:add grounded_sam2_tracking_camera_with_continuous_id.py (closes #74) * update README	2025-05-08 11:02:33 +08:00
Sami Haidar	7fec804683	Pinned setuptools in Dockerfile (#99 ) Co-authored-by: Sami Haidar Wehbe <sami@autoenhance.ai>	2025-05-08 11:02:04 +08:00
rentainhe	9412a16276	update DINO-X api to V2	2025-04-21 01:06:01 +08:00
rentainhe	d49257700a	update DINO-X api usage to dds v2	2025-04-20 01:04:26 +08:00
rentainhe	3c5a4136d4	update DINO-X api usage to dds v2	2025-04-20 00:38:38 +08:00
Andrew Choi	8238557f52	Add torch2.6 support for ms_deform_attn_cuda (#94 )	2025-04-18 00:38:51 +08:00
Reuben Feinman	0bc3970292	update setuptools build requirement to fix build error (#91 )	2025-03-24 22:26:04 +08:00
rentainhe	dd4c5141b7	update README	2024-12-09 11:35:16 +08:00
rentainhe	53dcb96024	Merge branch 'main' of github.com:IDEA-Research/Grounded-SAM-2 into main	2024-12-05 17:44:07 +08:00
rentainhe	87b1cb4e79	update dds-cloudapi-sdk to 0.3.3	2024-12-05 17:43:56 +08:00
Ren Tianhe	411f157507	Fix typo	2024-12-04 12:49:32 +08:00
rentainhe	de62b7fb0b	add dino-x sam2 tracking demo	2024-12-02 21:01:55 +08:00
rentainhe	779147bc48	support DINO-X with SAM 2 for detection and segmentation	2024-12-02 19:43:59 +08:00
Ren Tianhe	13d23efc55	Add DINO-X arXiv link	2024-12-02 16:21:45 +08:00
John Heilman	7262a579d7	Fixed Type on MORE_DETAILED_CAPTION (#68 )	2024-11-03 11:09:14 +08:00
rentainhe	91c69d763f	Merge branch 'main' of github.com:IDEA-Research/Grounded-SAM-2 into main	2024-10-31 15:50:31 +08:00
rentainhe	1aec7ded16	support box threshold in GD 1.5 demos	2024-10-31 15:50:14 +08:00
Susan Shen	bf57b3086c	fix: zero object detection error (#64 ) * update dockerfile * fix: zero object detection error * fix: zero object detection error	2024-10-30 20:38:30 +08:00
rentainhe	e537a1e763	refine useless code	2024-10-30 11:25:19 +08:00
rentainhe	f776d247b9	refine README	2024-10-29 10:43:26 +08:00
rentainhe	5a44fe1889	refine README	2024-10-29 10:39:57 +08:00
rentainhe	fb875add87	refine README	2024-10-29 10:39:29 +08:00
rentainhe	3ef929186b	refine README	2024-10-29 10:34:51 +08:00
rentainhe	d608a9adad	update README for SAHI inference	2024-10-24 17:24:12 +08:00
rentainhe	041bb0bfa4	support slice inference on gd1.5 sam2 demo	2024-10-24 16:57:04 +08:00
rentainhe	be550a93b1	update running info	2024-10-23 11:27:18 +08:00
rentainhe	5abc04b713	update README	2024-10-18 16:21:32 +08:00
rentainhe	20dd89b60e	fix hyper link for florence-2 demo	2024-10-16 15:26:13 +08:00
rentainhe	5c6166fb7e	refine config	2024-10-15 12:17:53 +08:00
Ren Tianhe	82e503604f	[New Feature] Support SAM 2.1 (#59 ) * support sam 2.1 * refine config path and ckpt path * update README	2024-10-10 14:55:50 +08:00
kwikwag	e899ad99e8	feat: grounded_sam2_hf_mode cli arguments (#52 )	2024-09-24 20:33:11 +08:00
Susan Shen	81ac531aa9	update dockerfile (#47 )	2024-09-06 20:44:58 +08:00
rentainhe	379e35cb40	support custom video tracking demo with local gd1.0 model	2024-09-05 14:57:17 +08:00
rentainhe	834de442cb	update README for results format	2024-09-04 01:28:18 +08:00
Ren Tianhe	5ee7526b79	Merge pull request #42 from Greywan/dev fix:fixed interruptions when there is no mask result for the current …	2024-09-03 16:07:00 +08:00
wanjunhui1	f3d381901a	fix:fixed interruptions when there is no mask result for the current frame	2024-09-03 15:32:38 +08:00
Ren Tianhe	be1fa537bb	Merge pull request #40 from IDEA-Research/dump_json_results [Update] Support automatically dumping json results in image demos	2024-08-31 20:58:49 +08:00
rentainhe	daf5bb3f97	support dump results in local demo	2024-08-31 20:58:37 +08:00
rentainhe	e0daab208a	support dump results in local demo	2024-08-31 20:55:49 +08:00
rentainhe	a99354bb25	add dump results to hf model demo	2024-08-31 20:40:59 +08:00
rentainhe	4f3adf3222	support dump results in 1.5 image demo	2024-08-31 20:22:17 +08:00
rentainhe	5d27e4f4f4	refine assert info typo	2024-08-31 12:13:53 +08:00
rentainhe	6e0ddadf7c	update to latest SAM 2	2024-08-21 18:11:44 +08:00
rentainhe	35efb4a5cb	refine README	2024-08-20 14:49:19 +08:00
rentainhe	6768682482	update News	2024-08-20 14:18:34 +08:00
rentainhe	a67a213e90	add docs for multi class detection using Florence-2 open-vocab func	2024-08-20 14:15:37 +08:00
rentainhe	da69bf587e	support auto label pipeline with florence-2	2024-08-19 16:37:50 +08:00
rentainhe	0e4c00c2b0	refine file name	2024-08-19 00:29:23 +08:00
rentainhe	899f7ccb68	Merge branch 'main' of github.com:IDEA-Research/Grounded-SAM-2 into main	2024-08-19 00:23:01 +08:00
rentainhe	afa91ca407	add open-vocab demo	2024-08-19 00:22:47 +08:00
Ren Tianhe	11e9eb35e0	Merge pull request #18 from ShuoShenDe/main feat: continuous_id_plus	2024-08-16 15:45:06 +08:00
SusanSHEN	245fc7206b	feat: continuous_id_plus	2024-08-16 01:46:41 +02:00
rentainhe	5f886743d9	add notes	2024-08-16 02:13:49 +08:00
rentainhe	122c46d823	add referring demo	2024-08-16 02:12:41 +08:00
rentainhe	1fc4d469ab	support more demos with florence-2	2024-08-15 02:13:30 +08:00
Ren Tianhe	35541890cc	Update README.md	2024-08-14 17:28:42 +08:00
rentainhe	303bec0406	update content	2024-08-14 16:00:12 +08:00
rentainhe	14d6e504b9	start supporting pipeline with florence-2	2024-08-14 15:21:03 +08:00
Ren Tianhe	0df4f62ddc	Merge pull request #12 from MorganTitcher/main fix: fixed typo in class name and in example notebooks	2024-08-14 14:47:17 +08:00
Ren Tianhe	9b8b41f04b	Merge pull request #17 from WFram/main Add building gdino into Dockerfile	2024-08-14 10:16:05 +08:00
WFram	09d0ceb928	Add building gdino into Dockerfile	2024-08-14 02:07:00 +03:00
rentainhe	37ea3c4171	update news	2024-08-12 11:31:16 +08:00
rentainhe	293c11ff63	refine image	2024-08-12 09:56:44 +08:00
rentainhe	e3ae4dcc29	refine image	2024-08-12 09:54:31 +08:00
rentainhe	bfc16d3089	refine image	2024-08-12 09:50:03 +08:00
Morgan Titcher	f79b544fe6	Merge pull request #1 from MorganTitcher/quick-fix-typo fixed typo in class name and in example notebooks	2024-08-11 16:45:14 -04:00
Morgan Titcher	d5b27842a2	fixed typo in class name and in example notebooks	2024-08-11 20:26:26 +00:00
Ren Tianhe	4e7fe58609	Merge pull request #9 from ShuoShenDe/main fix: create dockerfile and makefile #8	2024-08-10 11:21:42 +08:00
SusanSHEN	0f48470ec3	update README	2024-08-09 22:39:52 +02:00
SusanSHEN	0ee1d39509	fix:add makefile and dockerfile	2024-08-09 22:23:01 +02:00
SusanSHEN	f211f7712f	Merge branch 'main' of https://github.com/ShuoShenDe/Grounded-SAM-2 into main	2024-08-09 20:05:01 +02:00
rentainhe	b5ebc653c5	support 1.5 find new object demo	2024-08-10 01:07:07 +08:00
SusanSHEN	cbc3b2fb8b	Merge branch 'main' of https://github.com/ShuoShenDe/Grounded-SAM-2 into main	2024-08-09 15:38:43 +02:00
Shuo Shen	b2b81980f8	fix:update README	2024-08-09 15:25:13 +02:00
rentainhe	0247a8f102	refine citation	2024-08-09 19:28:19 +08:00
rentainhe	91914958bd	refine video	2024-08-09 19:23:55 +08:00
rentainhe	4e0285a97b	refine video	2024-08-09 19:19:06 +08:00
rentainhe	cabbad473b	update visualization func	2024-08-09 19:14:20 +08:00
rentainhe	ccacb31e59	update README	2024-08-09 10:10:17 +08:00
rentainhe	2ffe804ad9	update README	2024-08-09 09:47:05 +08:00
rentainhe	ae99dca27f	refine video link	2024-08-09 09:38:01 +08:00
rentainhe	3a0089e7cb	refine README	2024-08-09 09:36:03 +08:00
Ren Tianhe	0ba553bf88	Merge pull request #6 from ShuoShenDe/main feat: create grounded_sam2_tracking_demo_with_continuous_id.py	2024-08-09 09:17:01 +08:00
bd8090	f22e6bde05	fix:update README	2024-08-09 02:54:02 +02:00
bd8090	d0c10627be	fix:update README	2024-08-09 02:51:41 +02:00
bd8090	b99e0f7ee4	update README	2024-08-09 02:41:29 +02:00
bd8090	df626551c4	feat: add grounded_sam2_tracking_demo_with_continuous_id.py and test data	2024-08-09 02:33:24 +02:00
rentainhe	80676e866b	fix mask shape bug	2024-08-09 01:54:40 +08:00
rentainhe	9c5786fc09	update	2024-08-09 01:36:12 +08:00
rentainhe	fda8e8af23	refine Grounded-SAM-2 to Grounded SAM 2	2024-08-09 01:35:38 +08:00
Ren Tianhe	fd89106349	Update README.md	2024-08-08 18:00:02 +08:00
rentainhe	04ad096725	support more prompt in simple demo	2024-08-08 12:26:59 +08:00
rentainhe	223df6c912	add new tracking pipeline vis	2024-08-08 12:15:58 +08:00
rentainhe	87dc52d968	refine README	2024-08-08 12:15:42 +08:00
rentainhe	21c1002e66	Merge branch 'main' of github.com:IDEA-Research/Grounded-SAM-2 into main	2024-08-08 12:03:46 +08:00
rentainhe	077064c365	update to the latest sam2 version and support box prompts in video tracking	2024-08-08 12:03:29 +08:00
Ren Tianhe	c3159ee2f3	Merge pull request #2 from eltociear/patch-1 docs: update README.md	2024-08-08 00:14:50 +08:00
Ikko Eltociear Ashimine	837d871442	docs: update README.md initilize -> initialize	2024-08-08 01:04:53 +09:00
rentainhe	96cbab92e0	support gd1.0 tracking demo with custom input	2024-08-07 17:14:46 +08:00
rentainhe	ce0fc19c98	refine pipeline vis	2024-08-07 16:51:12 +08:00
rentainhe	a1b3cc3af6	refine README	2024-08-07 16:46:00 +08:00
rentainhe	31ec9420d8	refine README	2024-08-07 16:45:03 +08:00
rentainhe	37cf27cfe3	support mask prompt for video tracking	2024-08-07 16:42:49 +08:00
rentainhe	7c0995e9c3	refine file name	2024-08-07 15:51:24 +08:00
Ren Tianhe	7303af7250	Update Intro Video	2024-08-07 10:30:18 +08:00
Ren Tianhe	aa7913cd6c	Update README.md	2024-08-06 22:29:50 +08:00
rentainhe	858d955647	refine video	2024-08-06 18:30:57 +08:00
rentainhe	ad62ac6c09	refine README	2024-08-06 17:34:13 +08:00
rentainhe	e990003497	add demo video	2024-08-06 17:26:31 +08:00
rentainhe	231d213c58	support custom video input and tracking	2024-08-06 17:11:55 +08:00
rentainhe	731955be0f	update README	2024-08-06 03:08:06 +08:00
rentainhe	bb56590c47	add track pipeline vis	2024-08-06 03:03:23 +08:00
rentainhe	0563b4e368	refine README	2024-08-06 02:36:47 +08:00
rentainhe	6915725120	upgrade supervision to 0.22.0 and refine custom API usage	2024-08-06 01:59:27 +08:00
rentainhe	ed4c128a4e	update contents	2024-08-05 16:09:01 +08:00
rentainhe	2cdd3f2d92	add tracking demo with gd 1.5	2024-08-05 16:05:31 +08:00
rentainhe	41640f4add	refine installation	2024-08-02 21:21:39 +08:00
rentainhe	aae0fbc776	refine citation	2024-08-02 21:13:54 +08:00
rentainhe	b31da6037a	refine video file	2024-08-02 21:13:00 +08:00
rentainhe	f1a3c0a4f3	refine video file	2024-08-02 21:11:48 +08:00
rentainhe	20eb2ba25d	refine video file	2024-08-02 21:10:28 +08:00
rentainhe	a2d637f1d9	refine video link	2024-08-02 20:36:22 +08:00
rentainhe	aa84b67a69	add demo video	2024-08-02 18:36:16 +08:00
rentainhe	1d018ceb55	add tracking demo and support video dump	2024-08-02 17:06:32 +08:00
rentainhe	bf450b6b41	add grounded sam 2 tracking demo	2024-08-02 15:46:31 +08:00
rentainhe	8d88e75aa6	refine API token	2024-08-01 21:31:51 +08:00
rentainhe	d27829ff17	support 1.5 image demo	2024-08-01 21:30:56 +08:00
rentainhe	f5b99eea3d	support gdino local model (load local ckpt)	2024-08-01 17:58:42 +08:00
rentainhe	1dacb47840	support gsam2 image predictor model	2024-08-01 17:05:01 +08:00
Ren Tianhe	72501fecf8	Initial commit	2024-08-01 14:56:29 +08:00