add readme (#10)

* Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * remove submodule * add mPLUG MiniGPT4 * Update Readme.md * Update Readme.md * Update Readme.md --------- Co-authored-by: Yuliang Liu <34134635+Yuliang-Liu@users.noreply.github.com>
2023-06-01 09:57:03 +08:00
parent 64f7eb334d
commit 3213a65d96
275 changed files with 16059 additions and 6 deletions
--- a/models/mPLUG_Owl/pipeline/data_utils/init.py
+++ b/models/mPLUG_Owl/pipeline/data_utils/init.py
@@ -0,0 +1,24 @@
+from .processors.builder import build_processors
+from .xgpt3_dataset import MultiModalDataset
+
+def train_valid_test_datasets_provider(data_path, config, tokenizer, seq_length=1024):
+    """Build train and valid datasets."""
+    print('> building train and validation datasets for mPLUG-Owl ...')
+    train_ds, valid_ds = build_train_valid_test_datasets(
+        input_file=data_path,  
+        tokenizer=tokenizer,
+        max_length=seq_length, 
+        config=config)  
+    print("> finished creating mPLUG-Owl datasets ...")
+
+    return train_ds, valid_ds
+
+def build_train_valid_test_datasets(input_file, tokenizer, max_length=80, config=None):
+    train_processors = build_processors(config['train_processors'])
+    valid_processors = build_processors(config['valid_processors'])
+
+    assert len(input_file) == 2 # If you have files more than 2, modify code at here or merger them into train and dev
+    train_ds = MultiModalDataset(input_file[0], tokenizer, train_processors, max_length)
+    valid_ds = MultiModalDataset(input_file[1], tokenizer, valid_processors, max_length)
+    test_ds = None
+    return (train_ds, valid_ds)