init commit of samurai
This commit is contained in:
90
lib/train/dataset/imagenetvid_lmdb.py
Normal file
90
lib/train/dataset/imagenetvid_lmdb.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import os
|
||||
from .base_video_dataset import BaseVideoDataset
|
||||
from lib.train.data import jpeg4py_loader
|
||||
import torch
|
||||
from collections import OrderedDict
|
||||
from lib.train.admin import env_settings
|
||||
from lib.utils.lmdb_utils import decode_img, decode_json
|
||||
|
||||
|
||||
def get_target_to_image_ratio(seq):
|
||||
anno = torch.Tensor(seq['anno'])
|
||||
img_sz = torch.Tensor(seq['image_size'])
|
||||
return (anno[0, 2:4].prod() / (img_sz.prod())).sqrt()
|
||||
|
||||
|
||||
class ImagenetVID_lmdb(BaseVideoDataset):
|
||||
""" Imagenet VID dataset.
|
||||
|
||||
Publication:
|
||||
ImageNet Large Scale Visual Recognition Challenge
|
||||
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy,
|
||||
Aditya Khosla, Michael Bernstein, Alexander C. Berg and Li Fei-Fei
|
||||
IJCV, 2015
|
||||
https://arxiv.org/pdf/1409.0575.pdf
|
||||
|
||||
Download the dataset from http://image-net.org/
|
||||
"""
|
||||
def __init__(self, root=None, image_loader=jpeg4py_loader, min_length=0, max_target_area=1):
|
||||
"""
|
||||
args:
|
||||
root - path to the imagenet vid dataset.
|
||||
image_loader (default_image_loader) - The function to read the images. If installed,
|
||||
jpeg4py (https://github.com/ajkxyz/jpeg4py) is used by default. Else,
|
||||
opencv's imread is used.
|
||||
min_length - Minimum allowed sequence length.
|
||||
max_target_area - max allowed ratio between target area and image area. Can be used to filter out targets
|
||||
which cover complete image.
|
||||
"""
|
||||
root = env_settings().imagenet_dir if root is None else root
|
||||
super().__init__("imagenetvid_lmdb", root, image_loader)
|
||||
|
||||
sequence_list_dict = decode_json(root, "cache.json")
|
||||
self.sequence_list = sequence_list_dict
|
||||
|
||||
# Filter the sequences based on min_length and max_target_area in the first frame
|
||||
self.sequence_list = [x for x in self.sequence_list if len(x['anno']) >= min_length and
|
||||
get_target_to_image_ratio(x) < max_target_area]
|
||||
|
||||
def get_name(self):
|
||||
return 'imagenetvid_lmdb'
|
||||
|
||||
def get_num_sequences(self):
|
||||
return len(self.sequence_list)
|
||||
|
||||
def get_sequence_info(self, seq_id):
|
||||
bb_anno = torch.Tensor(self.sequence_list[seq_id]['anno'])
|
||||
valid = (bb_anno[:, 2] > 0) & (bb_anno[:, 3] > 0)
|
||||
visible = torch.ByteTensor(self.sequence_list[seq_id]['target_visible']) & valid.byte()
|
||||
return {'bbox': bb_anno, 'valid': valid, 'visible': visible}
|
||||
|
||||
def _get_frame(self, sequence, frame_id):
|
||||
set_name = 'ILSVRC2015_VID_train_{:04d}'.format(sequence['set_id'])
|
||||
vid_name = 'ILSVRC2015_train_{:08d}'.format(sequence['vid_id'])
|
||||
frame_number = frame_id + sequence['start_frame']
|
||||
frame_path = os.path.join('Data', 'VID', 'train', set_name, vid_name,
|
||||
'{:06d}.JPEG'.format(frame_number))
|
||||
return decode_img(self.root, frame_path)
|
||||
|
||||
def get_frames(self, seq_id, frame_ids, anno=None):
|
||||
sequence = self.sequence_list[seq_id]
|
||||
|
||||
frame_list = [self._get_frame(sequence, f) for f in frame_ids]
|
||||
|
||||
if anno is None:
|
||||
anno = self.get_sequence_info(seq_id)
|
||||
|
||||
# Create anno dict
|
||||
anno_frames = {}
|
||||
for key, value in anno.items():
|
||||
anno_frames[key] = [value[f_id, ...].clone() for f_id in frame_ids]
|
||||
|
||||
# added the class info to the meta info
|
||||
object_meta = OrderedDict({'object_class': sequence['class_name'],
|
||||
'motion_class': None,
|
||||
'major_class': None,
|
||||
'root_class': None,
|
||||
'motion_adverb': None})
|
||||
|
||||
return frame_list, anno_frames, object_meta
|
||||
|
Reference in New Issue
Block a user