add readme (#10)
* Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * remove submodule * add mPLUG MiniGPT4 * Update Readme.md * Update Readme.md * Update Readme.md --------- Co-authored-by: Yuliang Liu <34134635+Yuliang-Liu@users.noreply.github.com>
This commit is contained in:
24
models/mPLUG_Owl/pipeline/data_utils/__init__.py
Normal file
24
models/mPLUG_Owl/pipeline/data_utils/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from .processors.builder import build_processors
|
||||
from .xgpt3_dataset import MultiModalDataset
|
||||
|
||||
def train_valid_test_datasets_provider(data_path, config, tokenizer, seq_length=1024):
    """Create the mPLUG-Owl training and validation datasets.

    Thin wrapper around ``build_train_valid_test_datasets`` that prints a
    progress line before and after the build.

    Args:
        data_path: Forwarded to the builder as ``input_file``.
        config: Forwarded to the builder as ``config``.
        tokenizer: Forwarded to the builder as ``tokenizer``.
        seq_length: Forwarded to the builder as ``max_length``
            (defaults to 1024).

    Returns:
        A ``(train_ds, valid_ds)`` tuple. Despite the function name, no test
        dataset is returned.
    """
    print('> building train and validation datasets for mPLUG-Owl ...')

    # Collect the forwarded arguments under the names the builder expects.
    builder_kwargs = {
        'input_file': data_path,
        'tokenizer': tokenizer,
        'max_length': seq_length,
        'config': config,
    }
    # Unpack here so a builder result that is not a pair fails at this point.
    train_ds, valid_ds = build_train_valid_test_datasets(**builder_kwargs)

    print("> finished creating mPLUG-Owl datasets ...")
    return train_ds, valid_ds
|
||||
|
||||
def build_train_valid_test_datasets(input_file, tokenizer, max_length=80, config=None):
    """Build the training and validation ``MultiModalDataset`` objects.

    Args:
        input_file: Sized, indexable collection with exactly two entries.
            Entry 0 is passed to the training dataset and entry 1 to the
            validation dataset.
        tokenizer: Passed unchanged to both datasets.
        max_length: Passed unchanged to both datasets (defaults to 80).
        config: Mapping that is indexed with ``'train_processors'`` and
            ``'valid_processors'``; each value is handed to
            ``build_processors``. The ``None`` default exists only for
            signature compatibility -- the function cannot run without a
            real config.

    Returns:
        A ``(train_ds, valid_ds)`` tuple. No test dataset is built, despite
        the function name.

    Raises:
        ValueError: If ``input_file`` does not contain exactly two entries.
    """
    # Validate before doing any work, and with a real exception rather than
    # ``assert`` so the check still runs under ``python -O``.
    # If you have more than two files, modify the code here or merge them
    # into one train file and one dev file.
    if len(input_file) != 2:
        raise ValueError(
            "input_file must contain exactly two entries (train, valid); "
            f"got {len(input_file)}"
        )

    train_processors = build_processors(config['train_processors'])
    valid_processors = build_processors(config['valid_processors'])

    train_ds = MultiModalDataset(input_file[0], tokenizer, train_processors, max_length)
    valid_ds = MultiModalDataset(input_file[1], tokenizer, valid_processors, max_length)
    return (train_ds, valid_ds)
|
Reference in New Issue
Block a user