init commit of samurai

2024-11-19 22:12:54 -08:00
parent f65f4ba181
commit c17e4cecc0
679 changed files with 123982 additions and 0 deletions
--- a/lib/test/tracker/init.py
+++ b/lib/test/tracker/init.py
--- a/lib/test/tracker/artrack.py
+++ b/lib/test/tracker/artrack.py
@@ -0,0 +1,225 @@
+import math
+
+from lib.models.artrack import build_artrack
+from lib.test.tracker.basetracker import BaseTracker
+import torch
+
+from lib.test.tracker.vis_utils import gen_visualization
+from lib.test.utils.hann import hann2d
+from lib.train.data.processing_utils import sample_target
+# for debug
+import cv2
+import os
+
+from lib.test.tracker.data_utils import Preprocessor
+from lib.utils.box_ops import clip_box
+from lib.utils.ce_utils import generate_mask_cond
+import random
+
+class RandomErasing(object):
+    """Overwrite a random rectangle of an image tensor with fixed fill values.
+
+    Each call skips the erase when a uniform draw from [0, 1] exceeds
+    ``EPSILON``. Otherwise up to 100 candidate rectangles are sampled; the
+    first one that fits strictly inside the image is filled in place.
+
+    Args:
+        EPSILON: threshold compared against the uniform draw (larger means
+            the erase is attempted more often).
+        sl: lower bound of the erased area, as a fraction of the image area.
+        sh: upper bound of the erased area, as a fraction of the image area.
+        r1: lower bound of the sampled aspect ratio; the upper bound is ``1 / r1``.
+        mean: fill values, indexed per channel.
+    """
+
+    # A tuple default avoids sharing one mutable list between all instances.
+    def __init__(self, EPSILON=0.5, sl=0.02, sh=0.33, r1=0.3, mean=(0.4914, 0.4822, 0.4465)):
+        self.EPSILON = EPSILON
+        self.mean = mean
+        self.sl = sl
+        self.sh = sh
+        self.r1 = r1
+
+    def __call__(self, img):
+        """Possibly erase a rectangle of ``img`` in place and return ``img``.
+
+        ``img`` must expose ``size()`` with at least three entries; entry 0 is
+        treated as the channel count and entries 1 and 2 as the two spatial
+        extents.
+        """
+        if random.uniform(0, 1) > self.EPSILON:
+            return img
+
+        # The image size does not change between attempts, so read it once.
+        shape = img.size()
+        channels, height, width = shape[0], shape[1], shape[2]
+        area = height * width
+
+        for _ in range(100):
+            target_area = random.uniform(self.sl, self.sh) * area
+            aspect_ratio = random.uniform(self.r1, 1 / self.r1)
+
+            h = int(round(math.sqrt(target_area * aspect_ratio)))
+            w = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            # Reject rectangles that do not fit strictly inside the image.
+            if w >= width or h >= height:
+                continue
+
+            top = random.randint(0, height - h)
+            left = random.randint(0, width - w)
+            if channels == 3:
+                for channel in range(3):
+                    img[channel, top:top + h, left:left + w] = self.mean[channel]
+            else:
+                # NOTE(review): the non-RGB path fills with mean[1], not mean[0];
+                # kept as in the original - confirm this is intended.
+                img[0, top:top + h, left:left + w] = self.mean[1]
+            return img
+
+        return img
+
+
+class ARTrack(BaseTracker):
+    """Tracker that runs the ARTrack network on one search crop per frame.
+
+    ``self.state`` holds the current box as [x, y, w, h] in image pixels. It is
+    set from ``info['init_bbox']`` in ``initialize`` and replaced by every call
+    to ``track``.
+    """
+
+    def __init__(self, params, dataset_name):
+        """Build the network, load the checkpoint and prepare debug output.
+
+        Args:
+            params: tracker parameters; this method reads ``cfg``,
+                ``checkpoint``, ``debug`` and ``save_all_boxes``.
+            dataset_name: not used by this method.
+        """
+        super(ARTrack, self).__init__(params)
+        network = build_artrack(params.cfg, training=False)
+        print(self.params.checkpoint)
+        network.load_state_dict(torch.load(self.params.checkpoint, map_location='cpu')['net'], strict=True)
+        self.cfg = params.cfg
+        self.bins = self.cfg.MODEL.BINS
+        self.network = network.cuda()
+        self.network.eval()
+        self.preprocessor = Preprocessor()
+        self.state = None
+        self.range = self.cfg.MODEL.RANGE
+
+        self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE
+        # motion constraint
+        self.output_window = hann2d(torch.tensor([self.feat_sz, self.feat_sz]).long(), centered=True).cuda()
+
+        # for debug
+        # NOTE(review): use_visdom mirrors debug, so the image-saving branch is
+        # only reachable if use_visdom is changed after construction.
+        self.debug = params.debug
+        self.use_visdom = params.debug
+        self.frame_id = 0
+        self.erase = RandomErasing()
+        if self.debug:
+            if not self.use_visdom:
+                self.save_dir = "debug"
+                os.makedirs(self.save_dir, exist_ok=True)
+            else:
+                self._init_visdom(None, 1)
+        # for save boxes from all queries
+        self.save_all_boxes = params.save_all_boxes
+        self.z_dict1 = {}
+
+    def initialize(self, image, info: dict):
+        """Crop and preprocess the template and reset the tracking state.
+
+        Args:
+            image: first frame, passed to ``sample_target``.
+            info: must contain ``'init_bbox'``, the initial [x, y, w, h] box.
+
+        Returns:
+            ``{"all_boxes": ...}`` when ``save_all_boxes`` is set, else ``None``.
+        """
+        z_patch_arr, _, z_amask_arr = sample_target(image, info['init_bbox'], self.params.template_factor,
+                                                    output_sz=self.params.template_size)
+        self.z_patch_arr = z_patch_arr
+        self.z_dict1 = self.preprocessor.process(z_patch_arr, z_amask_arr)
+
+        self.box_mask_z = None
+
+        # save states
+        self.state = info['init_bbox']
+        self.frame_id = 0
+        if self.save_all_boxes:
+            # The initial box repeated once per object query.
+            all_boxes_save = info['init_bbox'] * self.cfg.MODEL.NUM_OBJECT_QUERIES
+            return {"all_boxes": all_boxes_save}
+
+    @staticmethod
+    def _xyxy_to_cxcywh(boxes):
+        """Convert (..., 4) boxes from (x1, y1, x2, y2) to (cx, cy, w, h)."""
+        x1, y1, x2, y2 = boxes.unbind(-1)
+        w = x2 - x1
+        h = y2 - y1
+        return torch.stack([x1 + w / 2, y1 + h / 2, w, h], dim=-1)
+
+    def track(self, image, info: dict = None):
+        """Predict the target box in ``image`` and update ``self.state``.
+
+        Args:
+            image: current frame as an (H, W, C) array.
+            info: only read in visdom debug mode, where ``info['gt_bbox']``
+                is plotted.
+
+        Returns:
+            ``{"target_bbox": state}``, plus ``"all_boxes"`` (flattened
+            per-prediction boxes) when ``save_all_boxes`` is set.
+        """
+        magic_num = (self.range - 1) * 0.5
+        H, W, _ = image.shape
+        self.frame_id += 1
+        x_patch_arr, resize_factor, x_amask_arr = sample_target(image, self.state, self.params.search_factor,
+                                                                output_sz=self.params.search_size)  # (x1, y1, w, h)
+        search = self.preprocessor.process(x_patch_arr, x_amask_arr)
+
+        with torch.no_grad():
+            out_dict = self.network.forward(
+                template=self.z_dict1.tensors, search=search.tensors)
+
+        # De-quantize the first four tokens of every predicted sequence into
+        # (x1, y1, x2, y2) boxes relative to the search crop.
+        boxes_xyxy = (out_dict['seqs'][:, 0:4] / (self.bins - 1) - magic_num).view(-1, 4)
+
+        # Final prediction: mean over all predicted boxes, as (cx, cy, w, h)
+        # rescaled from the crop back to image scale.
+        pred_box = (self._xyxy_to_cxcywh(boxes_xyxy.mean(dim=0))
+                    * self.params.search_size / resize_factor).tolist()
+
+        all_boxes_save = None
+        if self.save_all_boxes:
+            # Map every prediction back while self.state still holds the box the
+            # crop was centred on; it is overwritten just below.
+            per_query = self._xyxy_to_cxcywh(boxes_xyxy) * self.params.search_size / resize_factor
+            all_boxes_save = self.map_box_back_batch(per_query, resize_factor).reshape(-1).tolist()  # (4N, )
+
+        # get the final box result
+        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)
+
+        # for debug
+        if self.debug:
+            if not self.use_visdom:
+                x1, y1, w, h = self.state
+                image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+                cv2.rectangle(image_BGR, (int(x1), int(y1)), (int(x1 + w), int(y1 + h)), color=(0, 0, 255), thickness=2)
+                save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
+                cv2.imwrite(save_path, image_BGR)
+            else:
+                self.visdom.register((image, info['gt_bbox'].tolist(), self.state), 'Tracking', 1, 'Tracking')
+
+                self.visdom.register(torch.from_numpy(x_patch_arr).permute(2, 0, 1), 'image', 1, 'search_region')
+                self.visdom.register(torch.from_numpy(self.z_patch_arr).permute(2, 0, 1), 'image', 1, 'template')
+
+                # Only plot the heatmaps when the network output actually carries
+                # a score map; reading it unconditionally raised NameError.
+                pred_score_map = out_dict['score_map'] if 'score_map' in out_dict else None
+                if pred_score_map is not None:
+                    self.visdom.register(pred_score_map.view(self.feat_sz, self.feat_sz), 'heatmap', 1, 'score_map')
+                    self.visdom.register((pred_score_map * self.output_window).view(self.feat_sz, self.feat_sz), 'heatmap', 1, 'score_map_hann')
+
+                if 'removed_indexes_s' in out_dict and out_dict['removed_indexes_s']:
+                    removed_indexes_s = [removed.cpu().numpy() for removed in out_dict['removed_indexes_s']]
+                    masked_search = gen_visualization(x_patch_arr, removed_indexes_s)
+                    self.visdom.register(torch.from_numpy(masked_search).permute(2, 0, 1), 'image', 1, 'masked_search')
+
+                # Block while paused; a single-step request releases one frame.
+                while self.pause_mode:
+                    if self.step:
+                        self.step = False
+                        break
+
+        if self.save_all_boxes:
+            return {"target_bbox": self.state,
+                    "all_boxes": all_boxes_save}
+        return {"target_bbox": self.state}
+
+    def map_box_back(self, pred_box: list, resize_factor: float):
+        """Map a (cx, cy, w, h) box from search-crop offsets to an image [x, y, w, h] box.
+
+        The crop is taken to be centred on the centre of ``self.state``.
+        """
+        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
+        cx, cy, w, h = pred_box
+        half_side = 0.5 * self.params.search_size / resize_factor
+        cx_real = cx + (cx_prev - half_side)
+        cy_real = cy + (cy_prev - half_side)
+        return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]
+
+    def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float):
+        """Batched ``map_box_back``: (N, 4) (cx, cy, w, h) in, (N, 4) (x, y, w, h) out."""
+        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
+        cx, cy, w, h = pred_box.unbind(-1)  # (N,4) --> (N,)
+        half_side = 0.5 * self.params.search_size / resize_factor
+        cx_real = cx + (cx_prev - half_side)
+        cy_real = cy + (cy_prev - half_side)
+        return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1)
+
+    def add_hook(self):
+        """Register forward hooks that collect ``output[1]`` of the first 12 backbone attention modules."""
+        enc_attn_weights = []
+
+        for i in range(12):
+            self.network.backbone.blocks[i].attn.register_forward_hook(
+                lambda module, inputs, output: enc_attn_weights.append(output[1])
+            )
+
+        self.enc_attn_weights = enc_attn_weights
+
+
+def get_tracker_class():
+    """Return the tracker class implemented by this module."""
+    return ARTrack
--- a/lib/test/tracker/artrack_seq.py
+++ b/lib/test/tracker/artrack_seq.py
@@ -0,0 +1,209 @@
+import math
+
+from lib.models.artrack_seq import build_artrack_seq
+from lib.test.tracker.basetracker import BaseTracker
+import torch
+
+from lib.test.tracker.vis_utils import gen_visualization
+from lib.test.utils.hann import hann2d
+from lib.train.data.processing_utils import sample_target, transform_image_to_crop
+# for debug
+import cv2
+import os
+
+from lib.test.tracker.data_utils import Preprocessor
+from lib.utils.box_ops import clip_box
+from lib.utils.ce_utils import generate_mask_cond
+
+
+class ARTrackSeq(BaseTracker):
+    """Tracker that runs the sequence-conditioned ARTrack network.
+
+    Besides the current box ``self.state`` ([x, y, w, h] in image pixels) it
+    keeps ``self.store_result``, a window of the last ``self.save_all`` boxes
+    that is encoded and passed to the network as ``seq_input`` on every frame.
+    """
+
+    def __init__(self, params, dataset_name):
+        """Build the network, load the checkpoint and prepare debug output.
+
+        Args:
+            params: tracker parameters; this method reads ``cfg``,
+                ``checkpoint``, ``debug`` and ``save_all_boxes``.
+            dataset_name: not used by this method.
+        """
+        super(ARTrackSeq, self).__init__(params)
+        network = build_artrack_seq(params.cfg, training=False)
+        print(self.params.checkpoint)
+        network.load_state_dict(torch.load(self.params.checkpoint, map_location='cpu')['net'], strict=True)
+        self.cfg = params.cfg
+        self.bins = self.cfg.MODEL.BINS
+        self.network = network.cuda()
+        self.network.eval()
+        self.preprocessor = Preprocessor()
+        self.state = None
+
+        self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE
+        # motion constraint
+        self.output_window = hann2d(torch.tensor([self.feat_sz, self.feat_sz]).long(), centered=True).cuda()
+
+        # for debug
+        # NOTE(review): use_visdom mirrors debug, so the image-saving branch is
+        # only reachable if use_visdom is changed after construction.
+        self.debug = params.debug
+        self.use_visdom = params.debug
+        self.frame_id = 0
+        if self.debug:
+            if not self.use_visdom:
+                self.save_dir = "debug"
+                os.makedirs(self.save_dir, exist_ok=True)
+            else:
+                self._init_visdom(None, 1)
+        # for save boxes from all queries
+        self.save_all_boxes = params.save_all_boxes
+        self.z_dict1 = {}
+        self.store_result = None
+        self.save_all = 7  # number of past boxes kept in store_result
+        self.x_feat = None
+        self.update = None
+        self.update_threshold = 5.0
+        self.update_intervals = 1
+
+    def initialize(self, image, info: dict):
+        """Crop and preprocess the template and reset state and box history.
+
+        Args:
+            image: first frame, passed to ``sample_target``.
+            info: must contain ``'init_bbox'``, the initial [x, y, w, h] box.
+
+        Returns:
+            ``{"all_boxes": ...}`` when ``save_all_boxes`` is set, else ``None``.
+        """
+        self.x_feat = None
+
+        z_patch_arr, _, z_amask_arr = sample_target(image, info['init_bbox'], self.params.template_factor,
+                                                    output_sz=self.params.template_size)
+        self.z_patch_arr = z_patch_arr
+        self.z_dict1 = self.preprocessor.process(z_patch_arr, z_amask_arr)
+
+        self.box_mask_z = None
+
+        # save states; the history starts out filled with copies of the initial box
+        self.state = info['init_bbox']
+        self.store_result = [info['init_bbox'].copy()]
+        self.store_result.extend(info['init_bbox'].copy() for _ in range(self.save_all - 1))
+        self.frame_id = 0
+        self.update = None
+        if self.save_all_boxes:
+            # The initial box repeated once per object query.
+            all_boxes_save = info['init_bbox'] * self.cfg.MODEL.NUM_OBJECT_QUERIES
+            return {"all_boxes": all_boxes_save}
+
+    @staticmethod
+    def _xyxy_to_cxcywh(boxes):
+        """Convert (..., 4) boxes from (x1, y1, x2, y2) to (cx, cy, w, h)."""
+        x1, y1, x2, y2 = boxes.unbind(-1)
+        w = x2 - x1
+        h = y2 - y1
+        return torch.stack([x1 + w / 2, y1 + h / 2, w, h], dim=-1)
+
+    def _encode_history(self, resize_factor):
+        """Encode ``store_result`` as one (1, 4 * len) row of quantized crop coordinates."""
+        tokens = []
+        for past_box in self.store_result:
+            box = transform_image_to_crop(torch.Tensor(past_box), torch.Tensor(self.state),
+                                          resize_factor,
+                                          torch.Tensor([self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE]),
+                                          normalize=True)
+            # (x, y, w, h) -> (x1, y1, x2, y2)
+            box[2] = box[2] + box[0]
+            box[3] = box[3] + box[1]
+            # Clamp to [-0.5, 1.5], then shift and scale onto the bin range.
+            box = box.clamp(min=-0.5, max=1.5)
+            tokens.append((box + 0.5) * (self.bins - 1))
+        return torch.cat(tokens, dim=-1).unsqueeze(0)
+
+    def track(self, image, info: dict = None):
+        """Predict the target box in ``image``; update ``self.state`` and the history.
+
+        Args:
+            image: current frame as an (H, W, C) array.
+            info: only read in visdom debug mode, where ``info['gt_bbox']``
+                is plotted.
+
+        Returns:
+            ``{"target_bbox": state}``, plus ``"all_boxes"`` (flattened
+            per-prediction boxes) when ``save_all_boxes`` is set.
+        """
+        H, W, _ = image.shape
+        self.frame_id += 1
+        x_patch_arr, resize_factor, x_amask_arr = sample_target(image, self.state, self.params.search_factor,
+                                                                output_sz=self.params.search_size)  # (x1, y1, w, h)
+        seqs_out = self._encode_history(resize_factor)
+        search = self.preprocessor.process(x_patch_arr, x_amask_arr)
+        with torch.no_grad():
+            out_dict = self.network.forward(
+                template=self.z_dict1.tensors, search=search.tensors,
+                seq_input=seqs_out, stage="sequence", search_feature=self.x_feat, update=None)
+
+        # Keep the returned search feature; it is fed back in on the next frame.
+        self.x_feat = out_dict['x_feat']
+
+        # De-quantize the first four tokens of every predicted sequence into
+        # (x1, y1, x2, y2) boxes relative to the search crop.
+        boxes_xyxy = (out_dict['seqs'][:, 0:4] / (self.bins - 1) - 0.5).view(-1, 4)
+
+        # Final prediction: mean over all predicted boxes, as (cx, cy, w, h)
+        # rescaled from the crop back to image scale.
+        pred_box = (self._xyxy_to_cxcywh(boxes_xyxy.mean(dim=0))
+                    * self.params.search_size / resize_factor).tolist()
+
+        all_boxes_save = None
+        if self.save_all_boxes:
+            # Map every prediction back while self.state still holds the box the
+            # crop was centred on; it is overwritten just below.
+            per_query = self._xyxy_to_cxcywh(boxes_xyxy) * self.params.search_size / resize_factor
+            all_boxes_save = self.map_box_back_batch(per_query, resize_factor).reshape(-1).tolist()  # (4N, )
+
+        # get the final box result
+        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)
+        if len(self.store_result) < self.save_all:
+            self.store_result.append(self.state.copy())
+        else:
+            # Slide the window: drop the oldest box and put the new state last.
+            self.store_result[:self.save_all] = self.store_result[1:self.save_all] + [self.state.copy()]
+
+        # for debug
+        if self.debug:
+            if not self.use_visdom:
+                x1, y1, w, h = self.state
+                image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+                cv2.rectangle(image_BGR, (int(x1), int(y1)), (int(x1 + w), int(y1 + h)), color=(0, 0, 255), thickness=2)
+                save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
+                cv2.imwrite(save_path, image_BGR)
+            else:
+                self.visdom.register((image, info['gt_bbox'].tolist(), self.state), 'Tracking', 1, 'Tracking')
+
+                self.visdom.register(torch.from_numpy(x_patch_arr).permute(2, 0, 1), 'image', 1, 'search_region')
+                self.visdom.register(torch.from_numpy(self.z_patch_arr).permute(2, 0, 1), 'image', 1, 'template')
+
+                # Only plot the heatmaps when the network output actually carries
+                # a score map; reading it unconditionally raised NameError.
+                pred_score_map = out_dict['score_map'] if 'score_map' in out_dict else None
+                if pred_score_map is not None:
+                    self.visdom.register(pred_score_map.view(self.feat_sz, self.feat_sz), 'heatmap', 1, 'score_map')
+                    self.visdom.register((pred_score_map * self.output_window).view(self.feat_sz, self.feat_sz), 'heatmap',
+                                         1, 'score_map_hann')
+
+                if 'removed_indexes_s' in out_dict and out_dict['removed_indexes_s']:
+                    removed_indexes_s = [removed.cpu().numpy() for removed in out_dict['removed_indexes_s']]
+                    masked_search = gen_visualization(x_patch_arr, removed_indexes_s)
+                    self.visdom.register(torch.from_numpy(masked_search).permute(2, 0, 1), 'image', 1, 'masked_search')
+
+                # Block while paused; a single-step request releases one frame.
+                while self.pause_mode:
+                    if self.step:
+                        self.step = False
+                        break
+
+        if self.save_all_boxes:
+            return {"target_bbox": self.state,
+                    "all_boxes": all_boxes_save}
+        return {"target_bbox": self.state}
+
+    def map_box_back(self, pred_box: list, resize_factor: float):
+        """Map a (cx, cy, w, h) box from search-crop offsets to an image [x, y, w, h] box.
+
+        The crop is taken to be centred on the centre of ``self.state``.
+        """
+        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
+        cx, cy, w, h = pred_box
+        half_side = 0.5 * self.params.search_size / resize_factor
+        cx_real = cx + (cx_prev - half_side)
+        cy_real = cy + (cy_prev - half_side)
+        return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]
+
+    def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float):
+        """Batched ``map_box_back``: (N, 4) (cx, cy, w, h) in, (N, 4) (x, y, w, h) out."""
+        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
+        cx, cy, w, h = pred_box.unbind(-1)  # (N,4) --> (N,)
+        half_side = 0.5 * self.params.search_size / resize_factor
+        cx_real = cx + (cx_prev - half_side)
+        cy_real = cy + (cy_prev - half_side)
+        return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1)
+
+    def add_hook(self):
+        """Register forward hooks that collect ``output[1]`` of the first 12 backbone attention modules."""
+        enc_attn_weights = []
+
+        for i in range(12):
+            self.network.backbone.blocks[i].attn.register_forward_hook(
+                lambda module, inputs, output: enc_attn_weights.append(output[1])
+            )
+
+        self.enc_attn_weights = enc_attn_weights
+
+
+def get_tracker_class():
+    """Return the tracker class implemented by this module."""
+    return ARTrackSeq
--- a/lib/test/tracker/basetracker.py
+++ b/lib/test/tracker/basetracker.py
@@ -0,0 +1,89 @@
+import time
+
+import torch
+from _collections import OrderedDict
+
+from lib.train.data.processing_utils import transform_image_to_crop
+from lib.vis.visdom_cus import Visdom
+
+
+class BaseTracker:
+    """Base class for all trackers."""
+
+    def __init__(self, params):
+        """Store ``params``; ``visdom`` stays ``None`` until ``_init_visdom`` succeeds."""
+        self.params = params
+        self.visdom = None
+
+    def predicts_segmentation_mask(self):
+        """Return whether the tracker outputs segmentation masks (``False`` here)."""
+        return False
+
+    def initialize(self, image, info: dict) -> dict:
+        """Overload this function in your tracker. This should initialize the model."""
+        raise NotImplementedError
+
+    def track(self, image, info: dict = None) -> dict:
+        """Overload this function in your tracker. This should track in the frame and update the model."""
+        raise NotImplementedError
+
+    def visdom_draw_tracking(self, image, box, segmentation=None):
+        """Send the image, box(es) and optional segmentation to the 'Tracking' window.
+
+        ``box`` is either a single box or an ``OrderedDict`` whose values are
+        boxes; in the latter case all values are drawn.
+        """
+        if isinstance(box, OrderedDict):
+            box = list(box.values())
+        else:
+            box = (box,)
+        if segmentation is None:
+            self.visdom.register((image, *box), 'Tracking', 1, 'Tracking')
+        else:
+            self.visdom.register((image, *box, segmentation), 'Tracking', 1, 'Tracking')
+
+    def transform_bbox_to_crop(self, box_in, resize_factor, device, box_extract=None, crop_type='template'):
+        """Transform an image-space box into normalized crop coordinates.
+
+        Args:
+            box_in: list [x1, y1, w, h], not normalized.
+            resize_factor: scale factor forwarded to ``transform_image_to_crop``.
+            device: device the result is moved to.
+            box_extract: same format as ``box_in``; defaults to ``box_in``.
+            crop_type: ``'template'`` or ``'search'``; selects the crop size.
+
+        Returns:
+            Tensor of shape [1, 1, 4], x1y1wh, normalized.
+
+        Raises:
+            NotImplementedError: for any other ``crop_type``.
+        """
+        if crop_type == 'template':
+            crop_sz = torch.Tensor([self.params.template_size, self.params.template_size])
+        elif crop_type == 'search':
+            crop_sz = torch.Tensor([self.params.search_size, self.params.search_size])
+        else:
+            raise NotImplementedError
+
+        box_in = torch.tensor(box_in)
+        if box_extract is None:
+            box_extract = box_in
+        else:
+            box_extract = torch.tensor(box_extract)
+        template_bbox = transform_image_to_crop(box_in, box_extract, resize_factor, crop_sz, normalize=True)
+        template_bbox = template_bbox.view(1, 1, 4).to(device)
+
+        return template_bbox
+
+    def _init_visdom(self, visdom_info, debug):
+        """Reset the UI flags and try to start a Visdom client.
+
+        A failure to start Visdom is reported with a warning instead of being
+        raised, so tracking can continue without visualization.
+        """
+        visdom_info = {} if visdom_info is None else visdom_info
+        self.pause_mode = False
+        self.step = False
+        self.next_seq = False
+        if debug > 0 and visdom_info.get('use_visdom', True):
+            try:
+                self.visdom = Visdom(debug, {'handler': self._visdom_ui_handler, 'win_id': 'Tracking'},
+                                     visdom_info=visdom_info)
+            except Exception:
+                # Catch ordinary failures only, so KeyboardInterrupt and
+                # SystemExit still propagate.
+                time.sleep(0.5)
+                print('!!! WARNING: Visdom could not start, so using matplotlib visualization instead !!!\n'
+                      '!!! Start Visdom in a separate terminal window by typing \'visdom\' !!!')
+
+    def _visdom_ui_handler(self, data):
+        """Handle key presses from the 'Tracking' window.
+
+        Space toggles pause mode, the right arrow key requests a single step
+        while paused, and 'n' sets the ``next_seq`` flag.
+        """
+        if data['event_type'] == 'KeyPress':
+            if data['key'] == ' ':
+                self.pause_mode = not self.pause_mode
+
+            elif data['key'] == 'ArrowRight' and self.pause_mode:
+                self.step = True
+
+            elif data['key'] == 'n':
+                self.next_seq = True
--- a/lib/test/tracker/data_utils.py
+++ b/lib/test/tracker/data_utils.py
@@ -0,0 +1,46 @@
+import torch
+import numpy as np
+from lib.utils.misc import NestedTensor
+
+
+class Preprocessor(object):
+    """Normalize an image patch on the GPU and wrap it with its mask in a NestedTensor."""
+
+    def __init__(self):
+        # Per-channel statistics, shaped (1, 3, 1, 1) to broadcast over a batch.
+        stats_shape = (1, 3, 1, 1)
+        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(stats_shape).cuda()
+        self.std = torch.tensor([0.229, 0.224, 0.225]).view(stats_shape).cuda()
+
+    def process(self, img_arr: np.ndarray, amask_arr: np.ndarray):
+        """Return ``NestedTensor(normalized image (1,3,H,W), bool mask (1,H,W))``."""
+        # Image patch: channels-last array -> batched channels-first float tensor.
+        channels_first = torch.tensor(img_arr).cuda().float().permute((2, 0, 1))
+        batched = channels_first.unsqueeze(dim=0)
+        normalized = ((batched / 255.0) - self.mean) / self.std  # (1,3,H,W)
+        # Attention mask: boolean tensor with a leading batch axis.
+        mask = torch.from_numpy(amask_arr).to(torch.bool).cuda()
+        batched_mask = mask.unsqueeze(dim=0)  # (1,H,W)
+        return NestedTensor(normalized, batched_mask)
+
+
+class PreprocessorX(object):
+    """Normalize an image patch on the GPU and return it with its mask as a plain tuple."""
+
+    def __init__(self):
+        # Per-channel statistics, shaped (1, 3, 1, 1) to broadcast over a batch.
+        stats_shape = (1, 3, 1, 1)
+        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(stats_shape).cuda()
+        self.std = torch.tensor([0.229, 0.224, 0.225]).view(stats_shape).cuda()
+
+    def process(self, img_arr: np.ndarray, amask_arr: np.ndarray):
+        """Return ``(normalized image (1,3,H,W), bool mask (1,H,W))``."""
+        # Image patch: channels-last array -> batched channels-first float tensor.
+        image = torch.tensor(img_arr).cuda().float()
+        image = image.permute((2, 0, 1)).unsqueeze(dim=0)
+        image = ((image / 255.0) - self.mean) / self.std  # (1,3,H,W)
+        # Attention mask: boolean tensor with a leading batch axis.
+        mask = torch.from_numpy(amask_arr).to(torch.bool).cuda().unsqueeze(dim=0)  # (1,H,W)
+        return image, mask
+
+
+class PreprocessorX_onnx(object):
+    """NumPy-only preprocessing: normalize an image patch and batch it with its mask."""
+
+    def __init__(self):
+        # Per-channel statistics, shaped (1, 3, 1, 1) to broadcast over a batch.
+        self.mean = np.array([0.485, 0.456, 0.406]).reshape((1, 3, 1, 1))
+        self.std = np.array([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))
+
+    def process(self, img_arr: np.ndarray, amask_arr: np.ndarray):
+        """Normalize the patch and add a batch axis to patch and mask.
+
+        Args:
+            img_arr: (H,W,3) image patch with values on a 0-255 scale.
+            amask_arr: (H,W) attention mask.
+
+        Returns:
+            Tuple of a float32 array of shape (1, 3, H, W) and a boolean
+            array of shape (1, H, W).
+        """
+        # Deal with the image patch
+        img_arr_4d = img_arr[np.newaxis, :, :, :].transpose(0, 3, 1, 2)
+        img_arr_4d = (img_arr_4d / 255.0 - self.mean) / self.std  # (1, 3, H, W)
+        # Deal with the attention mask
+        amask_arr_3d = amask_arr[np.newaxis, :, :]  # (1,H,W)
+        # The np.bool alias was removed in NumPy 1.24; np.bool_ exists in every release.
+        return img_arr_4d.astype(np.float32), amask_arr_3d.astype(np.bool_)
--- a/lib/test/tracker/vis_utils.py
+++ b/lib/test/tracker/vis_utils.py
@@ -0,0 +1,59 @@
+import numpy as np
+
+
+############## used to visualize eliminated tokens #################
+def get_keep_indices(decisions):
+    """Chain three stages of index selections.
+
+    Stage 0 is ``decisions[0]`` itself; each later stage indexes the previous
+    stage's result with ``decisions[stage]``. Returns the list of all three.
+    """
+    kept = [decisions[0]]
+    for stage in (1, 2):
+        previous = kept[-1]
+        kept.append(previous[decisions[stage]])
+    return kept
+
+
+def gen_masked_tokens(tokens, indices, alpha=0.2):
+    """Return a copy of ``tokens`` with the selected tokens blended towards 255.
+
+    Only the first row of ``indices`` is used to select tokens; each selected
+    token becomes ``alpha * token + (1 - alpha) * 255``. ``tokens`` itself is
+    not modified.
+    """
+    selected = indices[0].astype(int)
+    blended = tokens.copy()
+    blended[selected] = alpha * blended[selected] + (1 - alpha) * 255
+    return blended
+
+
+def recover_image(tokens, H, W, Hp, Wp, patch_size):
+    """Reassemble patch tokens into an (H, W, 3) image.
+
+    ``tokens`` is reshaped to a (Hp, Wp, patch_size, patch_size, 3) grid of
+    patches, whose patch-column and pixel-row axes are swapped before the
+    result is flattened back to (H, W, 3).
+    """
+    patch_grid = tokens.reshape(Hp, Wp, patch_size, patch_size, 3)
+    row_major = patch_grid.swapaxes(1, 2)
+    return row_major.reshape(H, W, 3)
+
+
+def pad_img(img):
+    """Return ``img`` copied onto a canvas that is 8 columns wider, padded on the right with 255."""
+    rows, cols, depth = img.shape
+    canvas = np.ones((rows, cols + 8, depth)) * 255
+    canvas[:rows, :cols, :] = img
+    return canvas
+
+
+def gen_visualization(image, mask_indices, patch_size=16):
+    """Render the image next to one copy per stage with the masked tokens faded out.
+
+    Args:
+        image: (H, W, 3) image, e.g. [224, 224, 3].
+        mask_indices: list with one array of masked token indices per stage.
+            Stage ``i`` is drawn with the indices of stages ``0..i``
+            concatenated along axis 1. The list is not modified.
+        patch_size: side length of one square token in pixels.
+
+    Returns:
+        The original image and all stage images, each padded by ``pad_img``
+        and concatenated side by side along the width axis.
+    """
+    # Accumulate the indices into a new list so the caller's list and its
+    # entries are left untouched.
+    cumulative = []
+    for stage_indices in mask_indices:
+        if cumulative:
+            stage_indices = np.concatenate([cumulative[-1], stage_indices], axis=1)
+        cumulative.append(stage_indices)
+
+    image = np.asarray(image)
+    H, W, C = image.shape
+    Hp, Wp = H // patch_size, W // patch_size
+    # Cut the image into Hp * Wp tokens of patch_size x patch_size pixels.
+    image_tokens = image.reshape(Hp, patch_size, Wp, patch_size, 3).swapaxes(1, 2).reshape(Hp * Wp, patch_size, patch_size, 3)
+
+    stages = [
+        recover_image(gen_masked_tokens(image_tokens, indices), H, W, Hp, Wp, patch_size)
+        for indices in cumulative
+    ]
+    imgs = [pad_img(img) for img in [image] + stages]
+    return np.concatenate(imgs, axis=1)